def createTagCloud(self, wordline):
    """ Create tag cloud image """
    wordstream = []
    if wordline == '':
        return False
    wordsTokens = WhitespaceTokenizer().tokenize(wordline)
    wordsTokens.remove(wordsTokens[0])
    wordstream.append(' '.join(wordsTokens))
    wordstream = ' '.join(wordstream)
    thresh = self.wordCount
    colorS = self.colorSchemes[self.color]
    tags = make_tags(get_tag_counts(wordstream)[:thresh],
                     minsize=3, maxsize=40,
                     colors=COLOR_SCHEMES[colorS])
    create_tag_image(tags, self.png,
                     size=(960, 400),
                     background=(255, 255, 255, 255),
                     layout=LAYOUT_HORIZONTAL,
                     fontname='Neuton')
    return True
def parse(self, corpus_filename, key):
    assert type(corpus_filename) == str, "the filename must be a string"
    assert type(key) == str, "the key must be a string"
    wst = WhitespaceTokenizer()
    with codecs.open(corpus_filename, encoding="utf8") as infile:
        corpus = [wst.tokenize(l) for l in infile]
    return {key: corpus}
def get_texts_raw(self):
    """
    Parse documents analogously to SimpleCorpus.get_texts(),
    but tokenized by whitespace only
    """
    wst = WhitespaceTokenizer()
    with self.getstream() as stream:
        for doc in stream:
            yield [word for word in wst.tokenize(utils.to_unicode(doc))]
def tokenizeDoc(self, doc):
    """ Get the tokens (words) from the doc using NLTK. """
    #print("Tokenizing doc")
    tokenizer = WhitespaceTokenizer()
    docTokens = tokenizer.tokenize(doc)
    return docTokens
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False
                    if len(lines) >= lines_per_subtitle:
                        end_list.append(lines)
                        lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)
    return end_list
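# Usage sketch for tokenize_english_document() above (not part of the original
# snippet): the sample dialogue is invented, and it assumes the function and its
# tokenizer imports (BlanklineTokenizer, PunktSentenceTokenizer, WhitespaceTokenizer)
# are already in scope.
sample = ("Thanks for coming in today. We really appreciate you taking the time to talk to us.\n\n"
          "No problem at all. I am happy to help however I can.")
for subtitle in tokenize_english_document(sample):
    for line in subtitle:
        print(line)
    print("---")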
def _calculate_sentence_title_score(self, sentence):
    """Calculates a score based on how many words the sentence shares with the article title."""
    title = self._remove_punctuation(self.title)
    sentence = self._remove_punctuation(sentence)
    tokenizer = WhitespaceTokenizer()
    tokenized_title = tokenizer.tokenize(title)
    tokenized_sentence = tokenizer.tokenize(sentence)
    common_words = set()
    for word in tokenized_sentence:
        if word in tokenized_title:
            common_words.add(word)
    score = float(len(common_words)) / len(tokenized_sentence)
    return SENTENCE_SCORE_WEIGHTS['title'] * score
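# A self-contained sketch of the same title-overlap idea as
# _calculate_sentence_title_score() above, without the class plumbing
# (self.title, self._remove_punctuation, SENTENCE_SCORE_WEIGHTS).
# The weight default is illustrative, not taken from the original code.
from nltk.tokenize import WhitespaceTokenizer

def title_overlap_score(title, sentence, weight=1.0):
    tok = WhitespaceTokenizer()
    title_words = set(tok.tokenize(title.lower()))
    sentence_words = tok.tokenize(sentence.lower())
    if not sentence_words:
        return 0.0
    common = {w for w in sentence_words if w in title_words}
    return weight * len(common) / len(sentence_words)

print(title_overlap_score("rover finds water on mars",
                          "the rover may have found water"))  # 2 shared words / 6 words ~= 0.33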
def get_words(document):
    ''' Return a frequency distribution of the lemmatized words in document '''
    regex1 = re.compile(r'\W')           # match non-alphanumeric
    regex2 = re.compile(r'&(#)*(\w)*;')  # match html entities
    regex3 = re.compile(r'( ){2,}')      # match more than 2 spaces
    lemmatizer = WordNetLemmatizer()
    tokenizer = WhitespaceTokenizer()
    # lowercase document, remove punctuation and html entities
    document = regex3.sub(' ', regex2.sub(' ', regex1.sub(' ', document.lower())))
    words = [
        lemmatizer.lemmatize(word)
        for word in tokenizer.tokenize(document)
        if word not in STOPWORDS and len(word) > 2
    ]
    return FreqDist(words)
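# Hypothetical call to get_words() above. STOPWORDS is not defined in the snippet;
# NLTK's English stopword list is one plausible choice. Assumes the snippet's own
# imports (re, FreqDist, WordNetLemmatizer, WhitespaceTokenizer) are in scope and
# the 'stopwords' and 'wordnet' NLTK data packages are downloaded.
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))
freqs = get_words("The quick brown foxes kept jumping over the lazy dogs &amp; the quick cats")
print(freqs.most_common(5))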
def preprocess_article_content(text_df):
    # text_df is a data frame from a SQL query; column 'content' contains the text of each article
    print('preprocessing article text...')

    article_list = []

    # define punctuation to remove
    punc = set(r'''`~!@#$%^&*()-_=+\|]}[{;:'",<.>/?''')
    tokenizer = WhitespaceTokenizer()
    stop_words = set(stopwords.words('english'))
    #stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()

    kept_rows = []
    for row, article in enumerate(text_df['content']):
        cleaned_tokens = []
        # decode only if the content is still raw bytes (the original assumed Python 2 strings)
        if isinstance(article, bytes):
            article = article.decode('unicode-escape', 'ignore')
        tokens = tokenizer.tokenize(article.lower())
        for token in tokens:
            token = ''.join(ch for ch in token if ch not in punc)
            if token not in stop_words:
                if len(token) > 0 and len(token) < 20:
                    if not token[0].isdigit() and not token[-1].isdigit():
                        #stemmed_token = stemmer.stem(token)
                        lemmatized_token = lemmatizer.lemmatize(token)
                        #cleaned_tokens.append(stemmed_token)
                        cleaned_tokens.append(lemmatized_token)

        # join cleaned tokens into a string for subsequent LDA,
        # filtering out content that is likely noise (error messages etc.)
        if len(cleaned_tokens) > 100:
            article_list.append(' '.join(wd for wd in cleaned_tokens))
            kept_rows.append(row)

    print('preprocessed content for %d articles' % len(article_list))
    return article_list, kept_rows
def extract(self, corpus):
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords
    from nltk.tokenize import WhitespaceTokenizer

    exclude_words = stopwords.words('english')
    exclude_words.append('rt')
    exclude_words.append('&amp;')

    tok = WhitespaceTokenizer()
    lem = WordNetLemmatizer()

    tsents = [tok.tokenize(sent) for sent in corpus]
    norm_words = []
    for sent in tsents:
        for word in sent:
            if word.startswith('http://'):
                continue
            nword = lem.lemmatize(word.lower())
            if nword not in exclude_words:
                norm_words.append(nword)
    return nltk.FreqDist(norm_words)
def CleanAndTokenize(text):
    # Strip URLs and replace with token "URLURLURL"
    r = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
    text = re.sub(r, " URLURLURL", text)

    # Strip html tags
    soup = BeautifulSoup(text, "html.parser")
    for tag in soup.findAll(True):
        tag.replaceWithChildren()
    text = soup.get_text()

    # Normalize everything to lower case
    text = text.lower()

    # Strip line breaks and endings \r \n
    r = re.compile(r"[\r\n]+")
    text = re.sub(r, "", text)

    # get rid of em dashes
    # table = {
    #     ord(u'\u2018') : u"'",
    #     ord(u'\u2019') : u"'",
    #     ord(u'\u201C') : u'"',
    #     ord(u'\u201d') : u'"',
    #     ord(u'\u2026') : u'',
    #     ord(u'\u2014') : u'',
    # }
    # text = text.translate(table)

    # Normalize contractions
    # e.g. can't => can not, it's => it is, he'll => he will
    text = NormalizeContraction(text)

    # Strip punctuation (except for a few)
    punctuations = string.punctuation  # includes following characters: !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    excluded_punctuations = ["$", "%"]
    for p in punctuations:
        if p not in excluded_punctuations:
            text = text.replace(p, " ")

    # Condense double spaces
    text = text.replace("  ", " ")

    # Tokenize the text
    tokenizer = WhitespaceTokenizer()
    text_tokens = tokenizer.tokenize(text)

    return text_tokens
def tokenize(sent, tokenizer_type):
    # tokenizer_type is [0] the tokenizer, [1] the REGEX or ''
    tokenizer = 'not_implemented'
    # split on custom is the only non-nltk tokenizer
    if tokenizer_type[0] == 'split_on_custom':
        return sent.split(tokenizer_type[1])
    if tokenizer_type[0] == 'whitespace':
        tokenizer = WhitespaceTokenizer()
    if tokenizer_type[0] == 'wordpunkt':
        tokenizer = WordPunctTokenizer()
    if tokenizer_type[0] == 'regexp':
        tokenizer = RegexpTokenizer(tokenizer_type[1])
    if tokenizer_type[0] == 'treebank':
        tokenizer = TreebankWordTokenizer()
    try:
        if tokenizer != "not_implemented":
            return tokenizer.tokenize(sent)
        else:
            return 'Tokenizer not implemented'
    except ValueError:
        # if the input is not a list of strings
        pass
def main(args):
    tokenizer = WhitespaceTokenizer()
    voc = set()
    dir = args.train_dir

    dir_pos = os.path.join(dir, 'pos')
    cnt = 0
    fmt = 'Processed %d positive docs'
    for fname in os.listdir(dir_pos):
        if not fname.endswith('.txt'):
            continue
        cnt += 1
        if cnt % REPORT_INTERVAL == 0:
            print(fmt % cnt)
        # assume UTF-8 text; the original opened the files in binary mode under Python 2
        with open(os.path.join(dir_pos, fname), 'r', encoding='utf-8') as f:
            voc.update(map(lambda s: s.lower(), tokenizer.tokenize(f.read())))
    print(fmt % cnt)

    dir_neg = os.path.join(dir, 'neg')
    cnt = 0
    fmt = 'Processed %d negative docs'
    for fname in os.listdir(dir_neg):
        if not fname.endswith('.txt'):
            continue
        cnt += 1
        if cnt % REPORT_INTERVAL == 0:
            print(fmt % cnt)
        with open(os.path.join(dir_neg, fname), 'r', encoding='utf-8') as f:
            voc.update(map(lambda s: s.lower(), tokenizer.tokenize(f.read())))
    print(fmt % cnt)

    voc = sorted(list(voc))
    with open(args.output, 'wb') as f:
        pickle.dump(voc, f)
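# The vocabulary pickled by main() above can be read back like this; the
# file name is illustrative and should match whatever args.output was.
import pickle

with open("vocab.pkl", "rb") as f:
    voc = pickle.load(f)
print(len(voc), voc[:10])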
def buildVocab(self):
    self.vocabSize = int(self.vocabSize)
    print("Building vocab from frequencies")

    # get tokenized corpus and get word counts
    self.tokenizedCorpus = []
    self.vocabSet = set()
    tokenizer = WhitespaceTokenizer()
    for doc in self.corpus:
        # tokenize doc
        docTokens = tokenizer.tokenize(doc)
        self.tokenizedCorpus.extend(docTokens)
    print("  Tokenized corpus = ", len(self.tokenizedCorpus))

    # vocab for entire corpus
    self.fullVocab = set(self.tokenizedCorpus)
    print("  Full vocab = ", len(self.fullVocab))

    self.vocabCounts = {}
    # Extremely inefficient since has to iterate entire corpus for each word
    # generate counts for each word
    #for w in self.fullVocab:
    #    self.vocabCounts[w] = self.tokenizedCorpus.count(w)

    # for each word in corpus
    for w in self.tokenizedCorpus:
        if w in self.vocabCounts:
            self.vocabCounts[w] += 1
        else:
            self.vocabCounts[w] = 1

    # sort counts with most frequent first
    sortedCounts = sorted(self.vocabCounts.items(), key=operator.itemgetter(1), reverse=True)

    # generate vocab from first vocabSize words
    vocabCounts = sortedCounts[0:self.vocabSize]
    self.vocab = [e[0] for e in vocabCounts]
    print("  vocab = ", self.vocab)
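# The hand-rolled counting loop in buildVocab() can also be expressed with
# collections.Counter. A self-contained sketch with an invented three-document
# corpus and a vocabulary size of 3 (ties may come out in a different order):
from collections import Counter
from nltk.tokenize import WhitespaceTokenizer

docs = ["the cat sat", "the cat ran", "a dog ran"]
tok = WhitespaceTokenizer()
tokens = []
for doc in docs:
    tokens.extend(tok.tokenize(doc))
vocab = [w for w, _ in Counter(tokens).most_common(3)]
print(vocab)  # e.g. ['the', 'cat', 'ran']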
def topicWordFreq(self, wordline):
    """ Compute word frequencies for the word stream. """
    wordstream = []
    wordsTokens = WhitespaceTokenizer().tokenize(wordline)
    # drop the leading topic label token
    wordsTokens.remove(wordsTokens[0])
    wordstream.append(' '.join(wordsTokens))
    wordstream = ' '.join(wordstream)
    #print wordstream

    fdist = FreqDist()
    for word in word_tokenize(wordstream):
        # FreqDist.inc() was removed in NLTK 3; increment the count directly
        fdist[word] += 1
    result = [list(item) for item in fdist.items()]
    num = float(fdist.N())
    result = [[val[0], val[1], val[1] / num] for val in result]
    #print "samples:"
    #print fdist.items()
    #print fdist.keys()
    return result
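# topicWordFreq() above returns [word, count, relative frequency] triples.
# A minimal standalone illustration of that shape, using WhitespaceTokenizer
# so that no extra NLTK data download is needed:
from nltk import FreqDist
from nltk.tokenize import WhitespaceTokenizer

fdist = FreqDist(WhitespaceTokenizer().tokenize("to be or not to be"))
total = float(fdist.N())
print([[w, c, c / total] for w, c in fdist.items()])
# e.g. [['to', 2, 0.333...], ['be', 2, 0.333...], ['or', 1, 0.166...], ['not', 1, 0.166...]]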
def bag_of_words(voc, doc, handle_negation=False, handle_bigrams=False):
    """
    Generate bag of words according to dictionary.
    Haven't done sanity check on dictionary. Please make each word in dictionary unique and sorted.
    :param voc: list of words
    :param doc: string
    :return: feature vector. 0 as not appearing. 1 as appearing positive. -1 as appearing negative.
             Has the same size as the dictionary.
    """
    tokenizer = WhitespaceTokenizer()
    tokens = tokenizer.tokenize(doc)
    fv = np.zeros_like(voc, np.int8)
    is_previous_negative = False
    is_previous_enhanced = False
    for token in tokens:
        word = token.lower()
        if is_skip_word(word):
            continue
        if is_negative(word):
            is_previous_negative = True
            continue
        if is_degree(word):
            is_previous_enhanced = True
            continue
        try:
            idx = voc.index(word)
            fv[idx] = 1
            fv[idx] *= -1 if handle_negation and is_previous_negative else 1
            fv[idx] *= 2 if handle_bigrams and is_previous_enhanced else 1
        except ValueError:
            # word is not in the vocabulary
            pass
        is_previous_negative = False
        is_previous_enhanced = False
    return fv
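# Hypothetical call to bag_of_words() above. It relies on is_skip_word(),
# is_negative() and is_degree() helpers from the same module, which are not
# shown here; the toy vocabulary and review text are invented.
voc = ["good", "great", "terrible"]
print(bag_of_words(voc, "The food was not good", handle_negation=True))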
def __init__(self, mongo_db, postgre_db, sentence_mode=True, punctuation_mode=False, window_size=0):
    """Initialize a prototype with the specified configuration.

    Parameters:
    mongo_db -- Mongo DB connection
    postgre_db -- PostGre DB connection
    sentence_mode -- whether or not to use sentence window mode (default True)
    window_size -- the size of the sentence or word window (default 0)
    """
    self.__mongo_db = mongo_db
    self.__postgre_db = postgre_db
    self.__sentence_mode = sentence_mode
    self.___punctuation_mode = punctuation_mode
    self.__window_size = window_size
    self.tokenizer = WhitespaceTokenizer()
    self.parser = Parser()
import string
import json
import io
import sys

from nltk.tokenize import WhitespaceTokenizer

# problem with conjugation
# need hist for that too without decomposing them
# but now works

ic_dict = {}
cong = []
all_tokens = 0

# create IC dict
#tokenizer = RegexpTokenizer(r'\w+')
tokenizer = WhitespaceTokenizer()

filename = "../Subtlex.US.txt"
for line in open(filename, "r").readlines():
    line = line.lower()
    line = line.strip()
    #line = line.replace("-"," ")
    #line = "self-support"
    line = ' '.join(word.strip(string.punctuation) for word in line.split())
    print(tokenizer.tokenize(line))
    t_list = tokenizer.tokenize(line)
    for token in t_list:
        try:
            token = token.encode("ascii", "ignore").lower()
            #token = unicode(token, 'utf8')
            #token = token.encode('utf8')
# Write your code here
from nltk.tokenize import WhitespaceTokenizer
from nltk import bigrams, trigrams
from collections import Counter
from random import choices, choice
import re

filename = input()
tk = WhitespaceTokenizer()
with open(filename, 'r', encoding='utf-8') as fc:
    cops = fc.readlines()
cops1 = " ".join(cops)
copsx = tk.tokenize(cops1)
tricops = list(trigrams(copsx))
bicops = list(bigrams(copsx))

tricop_dict = {}
for head1, head2, tail in tricops:
    head = head1 + " " + head2
    tricop_dict.setdefault(head, []).append(tail)

regex1 = r'^[A-Z][a-z\']*?$'
regex2 = r'^[A-z\'-]*?$'
regex3 = r'^[A-z\'-]*?[.!?]$'
regex4 = r'^[A-Z][A-z\' ]*?$'

new_elements = ''
for _xelem in range(len(tricop_dict)):
    start1 = choice(tricops)
    if re.match(regex1, start1[0]) and re.match(regex2, start1[1]):
        new_elements = start1[0] + " " + start1[1]
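# The snippet above breaks off mid-generation. As a toy illustration of how the
# head -> tails dictionary it builds is consumed, with invented contents:
from random import choice

toy_dict = {"the quick": ["brown", "red"], "quick brown": ["fox"]}
head = "the quick"
print(head, choice(toy_dict[head]))  # e.g. "the quick brown"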
def reducer(self, sentence, docnames):
    tokens = WhitespaceTokenizer().tokenize(sentence)
    yield (', '.join(docnames) + ': ' + sentence + ' --> ', tokens)
import emoji
from emoji import UNICODE_EMOJI
import unicodedata
import num2words
import pandas
import string
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from operator import add
from itertools import starmap
from nltk.stem import LancasterStemmer

lancaster = LancasterStemmer()
stopWords = set(stopwords.words('english'))
word = WhitespaceTokenizer()


class TextSetup:
    def __init__(self, text):
        self.t = text
        self.text = self.t
        self.dtype_str = isinstance(self.text, str)
        self.dtype_list = isinstance(self.text, list)
        self.dtype_pd_series = isinstance(self.text, pandas.core.series.Series)
        if self.dtype_pd_series:
            self.t = text.tolist()
            self.text = self.t


class SwachhText(TextSetup):
    def __init__(self, TextSetup):
        self.t = TextSetup.t
import re
import pandas
from nltk.tokenize import WhitespaceTokenizer

# LOAD csv into dataframe
colnames = ['author', 'title', 'date', 'length', 'text']
df = pandas.read_csv('../data/talks_3.csv', names=colnames)
talks = df.text.tolist()

# All this for labels
authors = df.author.tolist()
dates = df.date.tolist()
years = [re.sub('[A-Za-z ]', '', item) for item in dates]
authordate = [author + " " + year for author, year in zip(authors, years)]

# TOKENIZE
tokenizer = WhitespaceTokenizer()
texts = []
for talk in talks:
    raw = re.sub(r"[^\w\d'\s]+", '', talk).lower()
    tokens = tokenizer.tokenize(raw)
    texts.append(tokens)

# =-=-=-=-=-=-=-=-=-=-=
# Small Test Corpus
# =-=-=-=-=-=-=-=-=-=-=

test = texts[0:5]

# =-=-=-=-=-=-=-=-=-=-=
# Function to collect word positions within a text (as a word list)
# =-=-=-=-=-=-=-=-=-=-=
mwe_tokenizer.tokenize(sentence5.split())
# 'For', 'more', '@indian_army',
# 'Indian Army' should be treated as a single token. But here "Army!" is treated as a token.

mwe_tokenizer.tokenize(sentence5.replace('!', '').split())  # "Army!" will be treated as Army

# 3. Regexp Tokenizer
from nltk.tokenize import RegexpTokenizer
reg_tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
reg_tokenizer.tokenize(sentence5)

# 4. Whitespace Tokenizer
from nltk.tokenize import WhitespaceTokenizer
wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence5)

# 5. WordPunct Tokenizer
from nltk.tokenize import WordPunctTokenizer
wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence5)

# Regexp Stemmer
sentence6 = "I love playing Cricket. Cricket players practice hard."
from nltk.stem import RegexpStemmer
regex_stemmer = RegexpStemmer('ing$')
' '.join([regex_stemmer.stem(wd) for wd in sentence6.split()])
def tokens(self):
    """method is used to parse the text file and to return a list of all tokens"""
    text = self.read_file()
    tk = WhitespaceTokenizer()
    return tk.tokenize(text)
import pymysql
from nltk.tokenize import WhitespaceTokenizer

connection = pymysql.connect(host="127.0.0.1",
                             user="******",
                             password="******",
                             charset='utf8',
                             db='tf-idf',
                             cursorclass=pymysql.cursors.DictCursor)
cursor = connection.cursor()

terms = ['debut', 'two', 'language', 'also']
tokenizer = WhitespaceTokenizer()

sql = 'SELECT * FROM wiki'
cursor.execute(sql)
for record in cursor.fetchall():
    doc_id = record['id']
    text = record['text']
    for term in terms:
        for start, end in tokenizer.span_tokenize(text):
            if text[start:end].lower() == term:
                insert_sql = 'INSERT INTO inverted_index VALUES (%s, %s)'
                cursor.execute(insert_sql, (term, doc_id))
                break

connection.commit()
connection.close()
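# The indexing loop above relies on span_tokenize() yielding character offsets
# rather than the token strings themselves. A standalone illustration with an
# invented sentence:
from nltk.tokenize import WhitespaceTokenizer

text = "Her debut album was recorded in two languages"
for start, end in WhitespaceTokenizer().span_tokenize(text):
    print(start, end, text[start:end])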
# Stage 2. Break the dataset into bigrams
from nltk.tokenize import WhitespaceTokenizer
from nltk import bigrams
from collections import Counter

# f = open(input(), "r", encoding="utf-8")
f = open(input(), "r", encoding="utf-8")

# We get tokens from the corpus
wtk = WhitespaceTokenizer().tokenize(f.read())
# Bigrams generates an iterator. Put type list to get the data
bigr = list(bigrams(wtk))

dict_bigr = {}
# We create a dictionary
# Key = First value in the tuple
# Value = Second value that we store as a list associated to the key
for key, value in bigr:
    dict_bigr.setdefault(key, []).append(value)

head = None
while head != "exit":
    head = input()
    if head != "exit":
        try:
            print(f"Head: {head}")
            cad = "The requested word is not in the model. Please input another word."
            p = dict_bigr.setdefault(head, cad)
            if p == cad:
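# A self-contained miniature of the head -> tails mapping built above, without
# the file and input() plumbing; the sample text is invented.
from nltk import bigrams
from nltk.tokenize import WhitespaceTokenizer

tokens = WhitespaceTokenizer().tokenize("to be or not to be or not")
model = {}
for head, tail in bigrams(tokens):
    model.setdefault(head, []).append(tail)
print(model["or"])  # ['not', 'not']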
def CleanAndTokenize(text): # Strip URLs and replace with token "URLURLURL" r = re.compile( r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" ) text = re.sub(r, " URLURLURL", text) # Strip html tags soup = BeautifulSoup(text) for tag in soup.findAll(True): tag.replaceWithChildren() text = soup.get_text() # Normalize everything to lower case text = text.lower() # Strip line breaks and endings \r \n r = re.compile(r"[\r\n]+") text = re.sub(r, "", text) table = { ord(u'\u2018'): u"'", ord(u'\u2019'): u"'", ord(u'\u201C'): u'"', ord(u'\u201d'): u'"', ord(u'\u2026'): u'', ord(u'\u2014'): u'', # get rid of em dashes } text = text.translate(table) # Normalize contractions # e.g. can't => can not, it's => it is, he'll => he will text = text.replace("can't", "can not") text = text.replace("couldn't", "could not") text = text.replace("don't", "do not") text = text.replace("didn't", "did not") text = text.replace("doesn't", "does not") text = text.replace("shouldn't", "should not") text = text.replace("haven't", "have not") text = text.replace("aren't", "are not") text = text.replace("weren't", "were not") text = text.replace("wouldn't", "would not") text = text.replace("hasn't", "has not") text = text.replace("hadn't", "had not") text = text.replace("won't", "will not") text = text.replace("wasn't", "was not") text = text.replace("can't", "can not") text = text.replace("isn't", "is not") text = text.replace("ain't", "is not") text = text.replace("it's", "it is") text = text.replace("i'm", "i am") text = text.replace("i'm", "i am") text = text.replace("i've", "i have") text = text.replace("i'll", "i will") text = text.replace("i'd", "i would") text = text.replace("we've", "we have") text = text.replace("we'll", "we will") text = text.replace("we'd", "we would") text = text.replace("we're", "we are") text = text.replace("you've", "you have") text = text.replace("you'll", "you will") text = text.replace("you'd", "you would") text = text.replace("you're", "you are") text = text.replace("he'll", "he will") text = text.replace("he'd", "he would") text = text.replace("he's", "he has") text = text.replace("she'll", "she will") text = text.replace("she'd", "she would") text = text.replace("she's", "she has") text = text.replace("they've", "they have") text = text.replace("they'll", "they will") text = text.replace("they'd", "they would") text = text.replace("they're", "they are") text = text.replace("that'll", "that will") text = text.replace("that's", "that is") text = text.replace("there's", "there is") # Strip punctuation (except for a few) punctuations = string.punctuation # includes following characters: !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ excluded_punctuations = ["$", "%"] for p in punctuations: if p not in excluded_punctuations: text = text.replace(p, " ") # Condense double spaces text = text.replace(" ", " ") # Tokenize the text # NOTE: Using a simple tokenizer based on spaces ... # Could also try a more sophisticated tokenizer if abbreviations / contractions should be conserved tokenizer = WhitespaceTokenizer() text_tokens = tokenizer.tokenize(text) return text_tokens
def get_corpus_stats(text_content):
    return WhitespaceTokenizer().tokenize(text_content)
target_vect = []
for line0 in tarin_label:
    target_vect.append(line0.strip())

dic_list = OrderedDict()
for line1 in Dictionary_txt:
    dic_list[line1.strip()] = 0
    preprocessed.write(line1.strip() + ",")
preprocessed.write("\n")

temp_dic = OrderedDict()
count = 0
for line in train_txt:
    feature_vect = []
    temp_dic = copy.copy(dic_list)
    token = WhitespaceTokenizer().tokenize(line)
    for wrd in token:
        if wrd in temp_dic:
            #print(wrd)
            temp_dic[wrd] = 1
    for key in temp_dic:
        feature_vect.append(temp_dic[key])
    temp_dic.clear()
    feature_vect.append(target_vect[count])
    print(feature_vect)
    str1 = ','.join(str(e) for e in feature_vect)
    preprocessed.write(str1)
    preprocessed.write("\n")
    count += 1

print(len(target_vect))
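# A self-contained miniature of the one-hot encoding step above, with an
# invented dictionary and two invented training lines:
from collections import OrderedDict
from nltk.tokenize import WhitespaceTokenizer

dic_list = OrderedDict((w, 0) for w in ["good", "bad", "service"])
for line in ["good service", "bad food"]:
    row = OrderedDict(dic_list)
    for wrd in WhitespaceTokenizer().tokenize(line):
        if wrd in row:
            row[wrd] = 1
    print(list(row.values()))  # [1, 0, 1] then [0, 1, 0]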
import logging
import re
import string

from unidecode import unidecode
import numpy as np

from affiliation_parser.keywords import *
from affiliation_parser.data_processor import us_cities, us_state_cities_map, us_city_pop_map

from nltk.tokenize import WhitespaceTokenizer

w_tokenizer = WhitespaceTokenizer()
punct_re = re.compile("[{}]".format(re.escape(string.punctuation)))

US_CITIES = us_cities()
US_CITIES_SET = set(US_CITIES)
US_CITIES_TOP_2000 = set(US_CITIES[:1000])  # note: despite the name, this keeps the top 1000 entries
US_CITIES_POP_MAP = us_city_pop_map()
US_STATE_CITY_MAP = us_state_cities_map()
MAX_WORDS = max(len(s.split()) for s in US_CITIES)


def string_steps(s: str, max_size=MAX_WORDS):
    string_words = s.upper().replace(',', '').replace('.', '').split()
    final_set = set([])
    for step in range(1, max_size + 1):
        for start in range(len(string_words)):
            final_set.add(" ".join(string_words[start:start + step]))
            if start + step > len(string_words):
                break
        if step > len(string_words):
            break
class InvertedIndex: ''' Main Inverted-Index structure''' def __init__(self): self._tokenizer = WhitespaceTokenizer() self._index_cache = IndexCache() self._stop_words = set(stopwords.words('english')) self._stemmer = SnowballStemmer("english") self._max_documents_per_shard = 50000 self._num_documents_in_current_shard = 0 if os.path.isfile("index_data/index.meta"): self._num_documents_in_current_shard = pickle.load( open("index_data/index.meta")) def search(self, query): combined_results = None ret_results = None for i in range(0, len(query), 2): op = query[i] keyword = self._stemmer.stem(query[i + 1].strip( string.punctuation)) keyword_results = self._search_keyword(keyword) if combined_results: if op == "AND": combined_results = combined_results.intersection( set(keyword_results.keys())) elif op == "OR": combined_results = combined_results.union( set(keyword_results.keys())) else: return {"status": False, "message": "Malformed query"} for doc in ret_results.keys(): if doc not in combined_results: del ret_results[doc] elif keyword_results.get(doc): ret_results[doc].union(keyword_results[doc]) for doc in keyword_results: if doc not in ret_results: ret_results[doc] = keyword_results[doc] else: combined_results = set(keyword_results.keys()) ret_results = keyword_results result_counts = dict() for el in ret_results: result_counts[el] = len(ret_results[el]) sorted_result_counts = sorted(result_counts.items(), key=operator.itemgetter(1), reverse=True) sorted_results = [] for key, _ in sorted_result_counts: sorted_results.append({"key": key, "positions": ret_results[key]}) if len(sorted_results) > 0: ret = {"status": True, "results": sorted_results} else: ret = {"status": False, "message": "No hits"} return ret def _search_keyword(self, query): docs = self._index_cache.get(query) if not docs: return dict() return docs def add(self, key, text): self._num_documents_in_current_shard += 1 if self._num_documents_in_current_shard > self._max_documents_per_shard: self._num_documents_in_current_shard = 0 self._index_cache.create_new_shard() token_positions = self._tokenizer.span_tokenize(text) for pos in token_positions: start_pos = pos[0] end_pos = pos[1] token = text[start_pos:end_pos].lower() if token in self._stop_words: continue token = token.strip(string.punctuation) token = self._stemmer.stem(token) if len(token) > 0: self._index_cache.add(token, key, (start_pos, end_pos)) def delete(self, key, text): pass def save(self): pickle.dump(self._num_documents_in_current_shard, open("index_data/index.meta", "wb")) self._index_cache.flush()
import nltk
from nltk.tokenize import WhitespaceTokenizer, TreebankWordTokenizer, WordPunctTokenizer

PHRASE = 'I hadn\'t taken my breakfast before I came to Sharan\'s class'

white_space = WhitespaceTokenizer()
tree_bank_word = TreebankWordTokenizer()
word_punct = WordPunctTokenizer()

print("WhitespaceTokenizer : ", white_space.tokenize(PHRASE))
print("TreebankWordTokenizer : ", tree_bank_word.tokenize(PHRASE))
print("WordPunctTokenizer : ", word_punct.tokenize(PHRASE))
class Plagiarism_Checker: def __init__(self, algorithm="TFIDF"): self.algorithm = algorithm self.stopWords = stopwords.words('english') self.wsTok = WhitespaceTokenizer() self.stemmer = LancasterStemmer() self.countVect = CountVectorizer() self.tfidfVect = TfidfVectorizer() self.queryData = [] self.srcData = [] def preprocess(self, documents): processed = [] for document in documents: #1 Removing Punctuations data = document.translate( str.maketrans("'", " ", string.punctuation)) #2 Converting to Lowercase data = data.lower() #3 Tokenization data = self.wsTok.tokenize(data) #4 Removing Stop Words data = [word for word in data if not word in self.stopWords] #5 Stemming words data = [self.stemmer.stem(word) for word in data] processed.append(data) return processed def setQueryText(self, data, clearData=True): if type(data) != list: print("Error : Set Query - Datatype should be 'list'") if clearData: self.queryData = [] for d in data: self.queryData.append(d) def setSourceText(self, data, clearData=True): if type(data) != list: print("Error : Set Source - Datatype should be 'list'") if clearData: self.srcData = [] for d in data: self.srcData.append(d) def jaccardSimilarity(self, query, document): inter_l = list(set(query) & set(document)) union_l = list(set(query) or set(document)) return len(inter_l) / len(union_l) def getPlagMatrix(self, documents): if self.algorithm == "TFIDF": data = [ ','.join(str(v) for v in document) for document in documents ] tfidf = self.tfidfVect.fit_transform(data) similarityMatrix = cosine_similarity(tfidf) elif self.algorithm == "TF": data = [ ','.join(str(v) for v in document) for document in documents ] sparse_matrix = self.countVect.fit_transform(data) doc_term_matrix = sparse_matrix.todense() tf = pd.DataFrame(doc_term_matrix, columns=self.countVect.get_feature_names()) similarityMatrix = cosine_similarity(tf) else: similarityMatrix = np.zeros((len(documents), len(documents))) for i, doc1 in enumerate(documents): for j, doc2 in enumerate(documents): similarityMatrix[i][j] = self.jaccardSimilarity(doc1, doc2) return similarityMatrix def getReport(self): query = self.preprocess(self.queryData) src = self.preprocess(self.srcData) similarity = [] for q in query: documents = [q] + src sim = self.getPlagMatrix(documents)[0][1:] similarity.append(sim) return similarity
def __init__(self, config):
    self.config = config
    self.sentence_tokenizer = PunktSentenceTokenizer()
    self.word_tokenizer = WhitespaceTokenizer()  # PunktWordTokenizer()
for f in listdir('corpus/'):
    if f[-4:] == ".txt" and not f in skipOver:
        fileName = f
        F = open('corpus/' + f)
        text = F.read()
        F.close()

        alphanum = letters + octdigits

        paragraphs = [s for s in text.split("\n\n") if s != ""][:-1]
        numParagraphs = len(paragraphs)

        # average paragraph size
        wst = WhitespaceTokenizer()
        paraWordCounts = [len(wst.tokenize(p)) for p in paragraphs]

        # the approximate number of words in the document
        numWords = sum(paraWordCounts)

        # the average number of words per paragraph
        avgParagraphLen = mean(paraWordCounts)

        # rejoin the paragraphs
        text = ' '.join(paragraphs)

        # part of speech word list for the text
        text = [word for subl in [pos_tag(wt(s)) for s in st(text)] for word in subl]

        # remove symbols from list by checking the first character of the word
def __init__(self, name, config): """ The init method downloads the required files, loads the file associated with a given subset (train/valid/test), concatenates all sencentes and tokenizes them using NLTK's WhitespaceTokenizer. :param name: Name of the component. :param class_type: Class type of the component. :param config: Dictionary of parameters (read from configuration ``.yaml`` file). """ # Call constructor of parent classes. Task.__init__(self, name, TranslationPairs, config) # Set streams key mappings. self.key_sources = self.stream_keys["sources"] self.key_targets = self.stream_keys["targets"] # Get absolute path to data folder. self.data_folder = os.path.expanduser(self.config['data_folder']) # Get dataset. if (self.config['dataset'] is None) or (self.config['dataset'] not in ["eng-fra", "eng-pol"]): raise ConfigurationError("Task supports only 'dataset' options: 'eng-fra', 'eng-pol'") dataset = self.config['dataset'] # Get (sub)set: train/valid/test. if (self.config['subset'] is None) or (self.config['subset'] not in ['train', 'valid', 'test']): raise ConfigurationError("Task supports one 'subset' options: 'train', 'valid', 'test' ") subset = self.config['subset'] # Extract source and target language name self.lang_source = self.config['dataset'].split('-')[0] self.lang_target = self.config['dataset'].split('-')[1] # Names of files used by this task. filenames = [ self.lang_source + ".train.txt", self.lang_target + ".train.txt", self.lang_source + ".valid.txt", self.lang_target + ".valid.txt", self.lang_source + ".test.txt", self.lang_target + ".test.txt" ] # Initialize dataset if files do not exist. if not io.check_files_existence(os.path.join(self.data_folder, dataset), filenames): # Set url and source filename depending on dataset. url = "https://www.manythings.org/anki/" + self.lang_target + "-" + self.lang_source + ".zip" zipfile_name = "translate_" + self.lang_target + "_" + self.lang_source + ".zip" with tempfile.TemporaryDirectory() as tmpdirname: # Download and extract wikitext zip. 
io.download_extract_zip_file(self.logger, tmpdirname, url, zipfile_name) # Create train, valid, test files from the downloaded file lines = io.load_string_list_from_txt_file(tmpdirname, self.lang_target + ".txt") # Shuffle the lines random.seed(42) random.shuffle(lines) # Split english and french pairs lines_source = [self.normalizeString(l.split('\t')[0]) for l in lines] lines_target = [self.normalizeString(l.split('\t')[1]) for l in lines] # Cut dataset into train (90%), valid (5%), test (5%) files test_index = len(lines) // 20 valid_index = test_index + (len(lines) // 20) os.makedirs(os.path.join(self.data_folder, dataset), exist_ok=True) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".test.txt"), mode='w+') as f: f.write('\n'.join(lines_source[0:test_index])) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".test.txt"), mode='w+') as f: f.write('\n'.join(lines_target[0:test_index])) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".valid.txt"), mode='w+') as f: f.write('\n'.join(lines_source[test_index:valid_index])) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".valid.txt"), mode='w+') as f: f.write('\n'.join(lines_target[test_index:valid_index])) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_source + ".train.txt"), mode='w+') as f: f.write('\n'.join(lines_source[valid_index:])) with open(os.path.join(os.path.join(self.data_folder, dataset), self.lang_target + ".train.txt"), mode='w+') as f: f.write('\n'.join(lines_target[valid_index:])) else: self.logger.info("Files {} found in folder '{}'".format(filenames, self.data_folder)) # Load the lines lines_source = io.load_string_list_from_txt_file(os.path.join(self.data_folder, dataset), self.lang_source + "."+subset+".txt") lines_target = io.load_string_list_from_txt_file(os.path.join(self.data_folder, dataset), self.lang_target + "."+subset+".txt") # Get the required sample length. self.sentence_length = self.config['sentence_length'] # Separate into src - tgt sentence pairs + tokenize tokenizer = WhitespaceTokenizer() self.sentences_source = [] self.sentences_target = [] for s_src, s_tgt in zip(lines_source, lines_target): src = tokenizer.tokenize(s_src) tgt = tokenizer.tokenize(s_tgt) # Keep only the pairs that are shorter or equal to the requested length # If self.sentence_length < 0, then give all the pairs regardless of length if (len(src) <= self.sentence_length and len(tgt) <= self.sentence_length) \ or self.sentence_length < 0: self.sentences_source += [src] self.sentences_target += [tgt] self.logger.info("Load text consisting of {} sentences".format(len(self.sentences_source))) # Calculate the size of dataset. self.dataset_length = len(self.sentences_source) # Display exemplary sample. self.logger.info("Exemplary sample:\n source: {}\n target: {}".format(self.sentences_source[0], self.sentences_target[0]))
class nltk_tokenizer(IncrementalTransform): """ a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new chunk with Sentence objects generated using NLTK tokenizers """ tagger_id = "nltk_tokenizer" def __init__(self, config): self.config = config self.sentence_tokenizer = PunktSentenceTokenizer() self.word_tokenizer = WhitespaceTokenizer() # PunktWordTokenizer() def _sentences(self, clean_visible): "generate strings identified as sentences" previous_end = 0 clean_visible = clean_visible.decode("utf8") assert isinstance(clean_visible, unicode) for start, end in self.sentence_tokenizer.span_tokenize(clean_visible): ## no need to check start, because the first byte of text ## is always first byte of first sentence, and we will ## have already made the previous sentence longer on the ## end if there was an overlap. if start < previous_end: start = previous_end if start > end: ## skip this sentence... because it was eaten by ## an earlier sentence with a label continue try: label = self.label_index.find_le(end) except ValueError: label = None if label: off = label.offsets[OffsetType.BYTES] end = max(off.first + off.length, end) previous_end = end sent_str = clean_visible[start:end] yield start, end, sent_str def make_label_index(self, stream_item): "make a sortedcollection on body.labels" labels = stream_item.body.labels.get(self.config.get("annotator_id")) if not labels: labels = [] self.label_index = SortedCollection(labels, key=lambda label: label.offsets[OffsetType.BYTES].first) def make_sentences(self, stream_item): "assemble Sentence and Token objects" self.make_label_index(stream_item) sentences = [] token_num = 0 new_mention_id = 0 for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible): assert isinstance(sent_str, unicode) sent = Sentence() sentence_pos = 0 for start, end in self.word_tokenizer.span_tokenize(sent_str): try: token_str = sent_str[start:end].encode("utf8") except Exception, exc: logger.critical("died on sent_str[%d:%d].encode('utf8')", start, end, exc_info=True) sys.exit("failed to cope with %r in %r" % (sent_str[start:end], sent_str)) tok = Token(token_num=token_num, token=token_str, sentence_pos=sentence_pos) tok.offsets[OffsetType.BYTES] = Offset( type=OffsetType.BYTES, first=sent_start + start, length=end - start ) ## whitespace tokenizer will never get a token ## boundary in the middle of an 'author' label try: # logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys) label = self.label_index.find_le(sent_start + start) except ValueError: label = None if label: off = label.offsets[OffsetType.BYTES] if off.first + off.length > sent_start + start: logger.info("overlapping label: %r" % label.target.target_id) ## overlaps streamcorpus.add_annotation(tok, label) assert label.annotator.annotator_id in tok.labels logger.info("adding label to tok: %r has %r", tok.token, label.target.target_id) if label in self.label_to_mention_id: mention_id = self.label_to_mention_id[label] else: mention_id = new_mention_id new_mention_id += 1 self.label_to_mention_id[label] = mention_id tok.mention_id = mention_id token_num += 1 sentence_pos += 1 sent.tokens.append(tok) sentences.append(sent) return sentences
def clear(self):
    self.tok_num = 0
    self.byte_idx = 0
    self.line_idx = 0
    self.word_tokenizer = WhitespaceTokenizer()
class Prototype: """Prototype system that searches for RDF pattern (aka Q-Calculus pattern) to find textsnippets.""" def __init__(self, mongo_db, postgre_db, sentence_mode=True, punctuation_mode=False, window_size=0): """Initialize a prototype with a specified configurations. Parameters: mongo_db -- Mongo DB connection postgre_db -- PostGre DB connection sentence_mode -- whether or not to use sentence window mode (default True) window_size -- the size of the sentence or word window (default 0) """ self.__mongo_db = mongo_db self.__postgre_db = postgre_db self.__sentence_mode = sentence_mode self.___punctuation_mode = punctuation_mode self.__window_size = window_size self.tokenizer = WhitespaceTokenizer() self.parser = Parser() def exit(self): """Close down the prototype.""" self.__mongo_db.close_connection() self.__postgre_db.close_connection() def create_new_collection(self, schema_name): self.__postgre_db.create_schema(schema_name) def get_window_size(self): """Gets the current window size.""" return self.__window_size def get_sentence_mode(self): """Returns True if sentence window mode is activated, else False.""" return self.__sentence_mode def change_window_size(self, size): """Change the current window size to a new size.""" value = 0 try: value = int(size) except ValueError: raise ValueError("Please type in a valid number.") if value >= 0: self.__window_size = value else: raise ValueError("Please type in a valid positive number.") def activate_sentence_window_mode(self): """Activate sentence window mode.""" self.__sentence_mode = True def activate_word_window_mode(self): """De-activate sentence window mode.""" self.__sentence_mode = False def activate_punctuation_mode(self): self.___punctuation_mode = True def deactivate_punctuation_mode(self): self.___punctuation_mode = False def get_punctuation_mode(self): return self.___punctuation_mode def get_word_window(self, pattern, tokens, constraints): """Get a word window list with a specific number of words. Parameters: pattern -- the pattern to search for tokens -- the tokens to search in constraints -- a constraint tuple list """ split_pattern = pattern.split() if len(split_pattern) > 1: textsnippets = self.__get_word_window_more_words_help(split_pattern, tokens, constraints) else: textsnippets = self.__get_word_window_one_word_help(pattern, tokens, constraints) return textsnippets def __get_word_window_more_words_help(self, split_pattern, tokens, constraints): """Find pattern with more than one word. 
""" textsnippets = [] textlength = len(tokens) for ind, token in enumerate(tokens): p_index = 0 end_index = ind while p_index < len(split_pattern): if self.check_pattern(split_pattern[p_index], tokens[end_index]): p_index += 1 end_index += 1 else: break if p_index == len(split_pattern): if constraints is not None: self.__check_constraints(constraints, (ind, end_index - 1), ind, split_pattern, None, None, textsnippets, tokens) else: pattern = " ".join(item for item in split_pattern) self.__get_word_window_help((ind, end_index - 1), textsnippets, textlength, tokens, pattern) return textsnippets def __get_word_window_one_word_help(self, pattern, tokens, constraints): """Find pattern with only one word.""" textsnippets = [] textlength = len(tokens) for ind, token in enumerate(tokens): if self.check_pattern(pattern, token): if constraints is not None: self.__check_constraints(constraints, (ind, ind), ind, pattern, None, None, textsnippets, tokens) else: self.__get_word_window_help((ind, ind), textsnippets, textlength, tokens, pattern) return textsnippets def __get_word_window_help(self, token_pos, textsnippets, textlength, tokens, pattern): snippet = self.__get_textsnippets(token_pos[0], token_pos[1], textlength, tokens) offset_start = re.search(pattern, snippet).span()[0] offset_end = offset_start + (len(pattern) - 1) SentObj = namedtuple('Sentence_Object', ['snippet', 'offset_start', 'offset_end']) textsnippets.append(SentObj(snippet=snippet, offset_start=offset_start, offset_end=offset_end)) def __get_textsnippets(self, indl, indr, textlength, tokens): if (indl - self.__window_size < 0) and (indr + self.__window_size > textlength): left_index = self.__window_size - 1 while not (indl - left_index) == 0: left_index -= 1 right_index = self.__window_size - 1 while not (indr + right_index) == textlength: right_index -= 1 return " ".join(tokens[indl - left_index:indr + right_index]) elif indr + self.__window_size > textlength: right_index = self.__window_size - 1 while not (indr + right_index) == textlength: right_index -= 1 return " ".join(tokens[indl - self.__window_size:indr + right_index]) elif indl - self.__window_size < 0: left_index = self.__window_size - 1 while not (indl - left_index) == 0: left_index -= 1 return " ".join(tokens[indl - left_index:indr + self.__window_size + 1]) else: return " ".join(tokens[indl - self.__window_size:indr + (self.__window_size + 1)]) def get_sentence_window(self, pattern, sentences, constraints): """Get a list with a specific number of sentences. size 0 will return the current sentence the pattern is found in. size n will return n sentences left and right from the initial sentence. 
Parameters: pattern -- the pattern to search for sentences -- the sentences to search in constraints -- the constraint tuple list """ split_pattern = pattern.split() if len(split_pattern) > 1: textsnippets = self.__get_sentence_window_more_words(split_pattern, sentences, constraints) else: textsnippets = self.__get_sentence_window_one_word(pattern, sentences, constraints) return textsnippets def __get_sentence_window_one_word(self, pattern, sentences, constraints): """Get sentence snippets with pattern containing of only one words according to window size.""" textsnippets = [] for ind, sent in enumerate(sentences): tokens = self.tokenizer.tokenize(sent) for i, token in enumerate(tokens): if self.check_pattern(pattern, token): if constraints is not None: self.__check_constraints(constraints, (i, i), ind, pattern, sent, sentences, textsnippets, tokens) else: self.__get_sentence_window_help(ind, sentences, textsnippets, pattern) return textsnippets def __check_constraints(self, constraints, token_pos, sent_num, pattern, sent, sentences, textsnippets, tokens): """Traverse the given list of constraints and find target words near the keyword. The number of word distance is given in the constraint list. add_info[0] is the keyword aka pattern. add_info[1] is the target_word aka the constraint. add_info[2] is the word distance from constraint to the pattern.""" pos = 0 more_words_flag = False if token_pos[0] == token_pos[1]: pos = token_pos[0] else: more_words_flag = True for add_info in constraints: # find pattern that matches target word index = add_info[2] found_constraint_flag = True if more_words_flag: constraint = add_info[0].split() i = 0 while found_constraint_flag and i < len(pattern) and i < len(constraint): if self.check_pattern(pattern[i], constraint[i]): pass else: found_constraint_flag = False break i += 1 if found_constraint_flag or self.check_pattern(pattern, add_info[0]): # set token_pos depending if index is positive or negative if more_words_flag and index > 0: pos = token_pos[1] elif more_words_flag and index < 0: pos = token_pos[0] if self.__sentence_mode: if (0 <= pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]): self.__get_sentence_window_help(sent_num, sentences, textsnippets, pattern) else: while index != 0: if index > 0: index -= 1 else: index += 1 if (0 < pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]): self.__get_sentence_window_help(sent_num, sentences, textsnippets, pattern) break else: if (0 <= pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]): self.__get_word_window_help(token_pos, textsnippets, len(tokens), tokens, pattern) else: while index != 0: if index > 0: index -= 1 else: index += 1 if (0 < pos + index < len(tokens)) and self.check_pattern(add_info[1], tokens[pos + index]): self.__get_word_window_help(token_pos, textsnippets, sent, tokens, pattern) break def __get_sentence_window_help(self, ind, sentences, textsnippets, pattern): sentence = self.__get_sentences(ind, sentences) # get offsets offset_start = re.search(pattern, sentence).span()[0] offset_end = offset_start + (len(pattern) - 1) SentObj = namedtuple('Sentence_Object', ['snippet', 'offset_start', 'offset_end']) textsnippets.append(SentObj(snippet=sentence, offset_start=offset_start, offset_end=offset_end)) def __get_sentence_window_more_words(self, split_pattern, sentences, constraints): """Get sentence snippets with pattern containing of more than 2 words according to window size.""" textsnippets 
= [] for ind, sent in enumerate(sentences): tokens = self.tokenizer.tokenize(sent) p_index = 0 begin_index = 0 end_index = 0 while p_index < len(split_pattern): if (end_index < len(tokens)) and self.check_pattern(split_pattern[p_index], tokens[end_index]): if p_index == 0: begin_index = end_index else: begin_index = begin_index + end_index - end_index p_index += 1 end_index += 1 else: break end_index -= 1 if p_index == len(split_pattern): # search for constraints in sentence if constraints is not None: self.__check_constraints(constraints, (begin_index, end_index), ind, split_pattern, sent, sentences, textsnippets, tokens) else: pattern = " ".join(item for item in split_pattern) self.__get_sentence_window_help(ind, sentences, textsnippets, pattern) return textsnippets def __get_sentences(self, ind, sentences): if self.__window_size == 0: return sentences[ind] elif self.__window_size > 0: left_window_border = ind - self.__window_size right_window_border = ind + self.__window_size + 1 if left_window_border < 0: left_window_border = 0 if right_window_border >= len(sentences): right_window_border = len(sentences) return " ".join(sentences[left_window_border:right_window_border]) def find_text_window(self, schema, text, text_id, constraints=None): """Finds text windows with variable size and pushes the found results in the PostGre database. Parameters: text -- text to search in text_id -- id of the text constraints -- the constraint tuple list""" # this is only a quick and dirty fix: replace weird quotes to basic ones for ch in ['›', '‹', '»', '«']: if ch in text: text = text.replace(ch, '"') tokenized_text = self.tokenizer.tokenize(text) if self.___punctuation_mode: punctuation_text = re.split('[!?.,;:]', text) punctuation_text = [item for item in punctuation_text if item != ''] for pattern in self.__postgre_db.get_data_from_table(schema, "single_pattern"): if self.___punctuation_mode and self.__sentence_mode: windows_objects = self.get_sentence_window( pattern['single_pattern'], punctuation_text, constraints) elif self.__sentence_mode: windows_objects = self.get_sentence_window( pattern['single_pattern'], sent_tokenize(text, language='german'), constraints) else: windows_objects = self.get_word_window(pattern['single_pattern'], tokenized_text, constraints) # push found snippets onto database if len(windows_objects) > 0: single_pattern_id = pattern['id'] for sent_obj in windows_objects: # push snippets self.__push_snippets(schema, sent_obj.snippet) snippet_id = self.__postgre_db.get_id(schema,"snippets", "snippet=" + add_quotes( replace_special_characters(sent_obj.snippet))) # push relations self.__push_texts_snippets(schema, text_id, snippet_id) self.__push_snippet_offsets(schema, single_pattern_id, snippet_id, sent_obj.offset_start, sent_obj.offset_end) def __push_snippets(self, schema, snippet): """Push found snippets onto the snippets table in PostGre DB, if not already in the table. 
Afterwards push the single_pattern and snippets relation.""" if not self.__postgre_db.is_in_table(schema, "snippets", "snippet=" + add_quotes( replace_special_characters(snippet))): self.__postgre_db.insert(schema,"snippets", {"snippet": snippet}) def __push_texts_snippets(self, schema, text_id, snippet_id): """Get all saved snippets that occur in a text and push them onto PostGre DB.""" self.__push_relation(schema, text_id, snippet_id, "text_id", "snippet_id", "texts_snippets") def __push_snippet_offsets(self, schema, single_pattern_id, snippet_id, offset_start, offset_end): """Push found single_pattern in snippets and their respective offset.""" if not self.__postgre_db.is_in_table( schema, "snippet_offsets", "single_pattern_id=" + str(single_pattern_id) + " and snippet_id=" + str( snippet_id)): self.__postgre_db.insert(schema, "snippet_offsets", { "single_pattern_id": single_pattern_id, "snippet_id": snippet_id, "offsets": [ [offset_start, offset_end]]}) else: old_list = self.__postgre_db.get(schema, "snippet_offsets", "single_pattern_id=" + str( single_pattern_id) + " and snippet_id=" + str(snippet_id), "offsets") old_list.append([offset_start, offset_end]) pid = self.__postgre_db.get_id(schema, "snippet_offsets", "single_pattern_id=" + str( single_pattern_id) + " and snippet_id=" + str(snippet_id)) self.__postgre_db.update(schema, "snippet_offsets", "offsets=" + add_quotes(replace_brackets(str( old_list))), "id=" + str(pid)) def __push_relation(self, schema, id1, id2, id1_name, id2_name, table): """Push a relation onto the PostGre DB. The relation has to have a primary key.""" # case: No entry about relation is in DB yet if not self.__postgre_db.is_in_table(schema, table, id1_name + "=" + str( id1)): self.__postgre_db.insert(schema, table, { id1_name: id1, id2_name: [id2], "aggregation": 0}) # case: Entry about single_pattern is in DB else: old_list = self.__postgre_db.get(schema, table, id1_name + "=" + str( id1), id2_name) new_list = list(set(old_list + [id2])) self.__postgre_db.update(schema, table, id2_name + "=" + add_quotes(replace_brackets(str( new_list))), id1_name + "=" + str(id1)) def __push_aggregation_lowest_layer(self, schema, aggregation_object, aggregation_name, table, id_name): """Push the aggregated snippet numbers onto corresponding the lower layer tables.""" for aggregation in aggregation_object: id = aggregation[aggregation_name][0] aggregation_value = aggregation[aggregation_name][1] self.__postgre_db.update(schema, table, "aggregation=" + str(aggregation_value), id_name + "=" + str(id)) def __push_aggregation(self, schema, table, sub_table, table_id, sub_table_id): """Calculate and push aggregation on the rest layer tables.""" table_entries = self.__postgre_db.get_data_from_table(schema, table) for entry in table_entries: aggregation = 0 entry_id = entry[table_id] entries_to_look_up = entry[sub_table_id] for look_up in entries_to_look_up: query = "SELECT SUM(aggregation) FROM " + schema + "." + sub_table + " WHERE " + sub_table_id + "=" + str(look_up) stored_value = self.__postgre_db.query(query)[0]['sum'] if stored_value is None: stored_value = 0 aggregation += stored_value self.__postgre_db.update(schema, table, "aggregation=" + str(aggregation), table_id + "=" + str(entry_id)) def get_snippets(self, schema, constraints): """Get snippets for the whole corpus. 
Parameter: constraints -- the constraint tuple list""" for ind, text in enumerate(self.__mongo_db.get(schema, {})): self.__postgre_db.insert(schema, "texts", {"title": text['title']}) self.find_text_window(schema, text['text'], text['id'], constraints) print("Finished extracting snippets from chapter " + str(text['id']) + ".") def aggregation(self, schema): """Calculate aggregation bottom-up and store the interim data onto the database.""" aggregation_texts_snippets = self.__postgre_db.query("SELECT " + schema + ".aggregate_texts_snippets()") aggregation_snippet_offsets = self.__postgre_db.query("SELECT " + schema + ".aggregate_snippet_offsets()") # push 2 lowest levels of the hierarchy self.__push_aggregation_lowest_layer(schema, aggregation_texts_snippets, str('aggregate_texts_snippets'), "texts_snippets", "text_id") self.__push_aggregation_lowest_layer(schema, aggregation_snippet_offsets, str('aggregate_snippet_offsets'), "snippet_offsets", "id") # push rest of the hierarchy self.__push_aggregation(schema, "pattern_single_pattern", "snippet_offsets", str('pattern_id'), str('single_pattern_id')) self.__push_aggregation(schema, "has_object", "pattern_single_pattern", str('bscale_id'), str('pattern_id')) self.__push_aggregation(schema, "has_attribute", "has_object", str('bsort_id'), str('bscale_id')) def aggregate_bscale(self, schema, new_bscale, bsort, scale_type, *args): pattern_info = self.__add_new_bscale(schema, new_bscale, bsort, scale_type, *args) if pattern_info is not None: pattern_ids = pattern_info[0] new_bscale_id = pattern_info[1] new_pattern_list = list(set.union(*[set(item) for item in pattern_ids])) aggregation = 0 for item in new_pattern_list: aggregation += self.__postgre_db.get(schema, "pattern_single_pattern", "pattern_id=" + str(item), "aggregation") self.__postgre_db.insert(schema, "has_object", {"bscale_id": new_bscale_id, "pattern_id": new_pattern_list, "aggregation": aggregation}) def intersect_bscale(self, schema, new_bscale, bsort, scale_type, *args): pattern_info = self.__add_new_bscale(schema, new_bscale, bsort, scale_type, *args) if pattern_info is not None: pattern_ids = pattern_info[0] new_bscale_id = pattern_info[1] new_pattern_list = list(set.intersection(*[set(item) for item in pattern_ids])) aggregation = 0 for item in new_pattern_list: aggregation += self.__postgre_db.get(schema, "pattern_single_pattern", "pattern_id=" + str(item), "aggregation") self.__postgre_db.insert(schema, "has_object", {"bscale_id": new_bscale_id, "pattern_id": new_pattern_list, "aggregation": aggregation}) def __add_new_bscale(self, schema, new_bscale, bsort, scale_type, *args): if args is not None: bscale_table = self.__postgre_db.get_data_from_table(schema, "bscale") bscale_ids = [] for scale in args: scale_found = False for bscale in bscale_table: if scale == bscale['bscale']: bscale_ids.append(bscale['id']) scale_found = True if not scale_found: raise Exception("Chosen Bscale does not exist.") if not self.__postgre_db.is_in_table(schema, "bscale", "bscale=" + add_quotes(new_bscale)): self.__postgre_db.insert(schema, "bscale", {"bscale": new_bscale, "nominal": False, "ordinal": False, "interval": False}) new_bscale_id = self.__postgre_db.get_id(schema, "bscale", "bscale=" + add_quotes(new_bscale)) self.__postgre_db.update(schema, "bscale", scale_type + "=" + add_quotes('True'), "id=" + str(new_bscale_id)) bsort_id = self.__postgre_db.get_id(schema, "bsort", "bsort=" + add_quotes(bsort)) if self.__postgre_db.is_in_table(schema, "has_attribute", "bsort_id=" + str(bsort_id)): 
old_list = self.__postgre_db.get(schema, "has_attribute", "bsort_id=" + str(bsort_id), "bscale_id") old_list.append(new_bscale_id) self.__postgre_db.update(schema, "has_attribute", "bscale_id=" + add_quotes( replace_brackets(str(old_list))), "bsort_id=" + str(bsort_id)) else: self.__postgre_db.insert(schema, "has_attribute", {"bsort_id": bsort_id, "bscale_id": [new_bscale_id], "aggregation": 0}) scale_obj = self.__postgre_db.get_data_from_table(schema, "has_object") pattern_ids = [] for scale_id in bscale_ids: for item in scale_obj: if scale_id == item['bscale_id']: pattern_ids.append(item['pattern_id']) return (pattern_ids, new_bscale_id) def find_correlating_pattern(self, schema): all_snippets_table = self.__postgre_db.get_data_from_table(schema, "snippets") all_snippets = [snippet['snippet'] for snippet in all_snippets_table] all_bscales_table = self.__postgre_db.get_data_from_table(schema, "bscale") all_bscales = [bscale['id'] for bscale in all_bscales_table] for bscale_id in all_bscales: pattern_list = self.__postgre_db.get(schema, "has_object", "bscale_id=" + str(bscale_id), "pattern_id") for pattern_id in pattern_list: single_pattern_id_list = self.__postgre_db.get( schema, "pattern_single_pattern", "pattern_id=" + str(pattern_id), "single_pattern_id") for single_pattern_id in single_pattern_id_list: single_pattern = self.__postgre_db.get(schema, "single_pattern", "id=" + str(single_pattern_id), "single_pattern") self.__postgre_db.insert(schema, "bscale_single_pattern", {"bscale_id": bscale_id, "single_pattern_id": single_pattern_id, "single_pattern": single_pattern , "count": 0}) for snippet in self.parser.nlp.pipe(all_snippets, batch_size=3000, n_threads=-1): correlating_pattern = self.parser.get_correlating_nouns_and_adjectives(snippet) for ind, item in enumerate(correlating_pattern): if self.__postgre_db.is_in_table(schema, "bscale_single_pattern", "single_pattern=" + add_quotes(item)): pattern_id = self.__postgre_db.get(schema, "bscale_single_pattern", "single_pattern=" + str(add_quotes(item)), "single_pattern_id") index = ind + 1 while index < len(correlating_pattern): next_item = correlating_pattern[index] if self.__postgre_db.is_in_table(schema, "bscale_single_pattern", "single_pattern=" + add_quotes(next_item)): pattern_next_item_id = self.__postgre_db.get(schema, "bscale_single_pattern", "single_pattern=" + str(add_quotes(next_item)), "single_pattern_id") if pattern_id != pattern_next_item_id: first_combination_in_table = self.__postgre_db.is_in_table( schema, "correlating_pattern", "pattern_a=" + str(pattern_id) + " and pattern_b=" + str(pattern_next_item_id)) second_combination_in_table = self.__postgre_db.is_in_table( schema, "correlating_pattern", "pattern_a=" + str(pattern_next_item_id) + " and pattern_b=" + str(pattern_id)) # update entry if already exists in table correlating_pattern if first_combination_in_table: old_count = self.__postgre_db.get(schema, "correlating_pattern", "pattern_a=" + str(pattern_id) + " and pattern_b=" + str(pattern_next_item_id), "count") new_count = old_count + 1 self.__postgre_db.update(schema, "correlating_pattern", "count=" + str(new_count), "pattern_a=" + str(pattern_id) + " and pattern_b=" + str(pattern_next_item_id)) elif second_combination_in_table: old_count = self.__postgre_db.get(schema, "correlating_pattern", "pattern_a=" + str( pattern_next_item_id) + " and pattern_b=" + str(pattern_id), "count") new_count = old_count + 1 self.__postgre_db.update(schema, "correlating_pattern", "count=" + str(new_count), "pattern_a=" + 
str(pattern_next_item_id) + " and pattern_b=" + str( pattern_id)) else: # create new entry for pattern pair if none exists self.__postgre_db.insert(schema, "correlating_pattern", { "pattern_a": pattern_id, "pattern_b": pattern_next_item_id, "count": 1}) index += 1 def find_spo_and_adjectives(self, schema): all_snippets_table = self.__postgre_db.get_data_from_table(schema, "snippets") all_snippets = [snippet['snippet'] for snippet in all_snippets_table] for snippet in self.parser.nlp.pipe(all_snippets, batch_size=3000, n_threads=-1): spo = self.parser.get_SVO(snippet) for item in spo: if item is not None: # subject is pattern if item.subject != "'": if self.__postgre_db.is_in_table(schema, "single_pattern", "single_pattern=" + add_quotes(item.subject)): self.push_parser_items(schema, item.subject, "subject_occ", "subject") self.push_parser_items(schema, item.verb, "verb_occ", "verb") self.push_parser_item_relationship( schema, item.subject, item.verb, "subject_verb_occ", "subject", "verb") if item.object != '': self.push_parser_items(schema, item.object, "object_occ", "object") self.push_parser_item_relationship(schema, item.subject, item.object, "subject_object_occ", "subject", "object") #object is pattern elif self.__postgre_db.is_in_table(schema, "single_pattern", "single_pattern=" + add_quotes(item.object)): self.push_parser_items(schema, item.object, "object_occ", "object") self.push_parser_items(schema, item.verb, "verb_occ", "verb") self.push_parser_item_relationship(schema, item.object, item.verb, "object_verb_occ", "object", "verb") if item.subject != '': self.push_parser_items(schema, item.subject, "subject_occ", "subject") self.push_parser_item_relationship(schema, item.subject, item.object, "subject_object_occ", "subject", "object") noun_adjectives = self.parser.nouns_adj_spacy(snippet) for item in noun_adjectives: subject = item['noun'] adjective = item['adj'] if self.__postgre_db.is_in_table( schema, "single_pattern", "single_pattern=" + add_quotes(item['noun'])): self.push_parser_items(schema, subject, "subject_occ", "subject") self.push_parser_items(schema, adjective, "adjective_occ", "adjective") self.push_parser_item_relationship( schema, subject, adjective, "subject_adjective_occ", "subject", "adjective") def push_parser_items(self, schema, word, table, word_type): if not self.__postgre_db.is_in_table(schema, table, word_type + "=" + add_quotes(word)): self.__postgre_db.insert(schema, table, {word_type: word, "count": 0}) def push_parser_item_relationship(self, schema, word1, word2, table, word_type1, word_type2): word1_id = self.__postgre_db.get_id(schema, word_type1 + "_occ", word_type1 + "=" + add_quotes(word1)) word2_id = self.__postgre_db.get_id(schema, word_type2 + "_occ", word_type2 + "=" + add_quotes(word2)) if not self.__postgre_db.is_in_table(schema, table, word_type1 + "=" + str( word1_id) + " and " + word_type2 + "=" + str(word2_id)): self.__postgre_db.insert(schema, table, {word_type1: word1_id, word_type2: word2_id, "count": 1}) else: table_id = self.__postgre_db.get_id(schema, table, word_type1 + "=" + str(word1_id) + " and " + word_type2 + "=" + str(word2_id)) old_count = self.__postgre_db.get(schema, table, "id=" + str(table_id), "count") self.__postgre_db.update(schema, table, "count=" + str(old_count + 1), "id=" + str(table_id)) def aggregate_occurences_help(self, text_counter, word): count = text_counter[word] if count == 0: return 1 else: return count def calculate_pmi(self, schema): print("Calculating PMI for " + schema) corpus_count = 0 for item in 
self.__mongo_db.get(schema, {}): corpus_count += len(word_tokenize(item['text'])) print(corpus_count) print("Lemmatizing corpus.") lemmatized_text = [] for ind, text in enumerate(self.__mongo_db.get(schema, {})): doc = text['text'] for ch in ['›', '‹', '»', '«']: if ch in doc: doc = doc.replace(ch, '"') lemmatized_text += self.parser.lemmatize_chunk(doc) print("Part " + str(ind) + " lemmatized.") self.aggregate_occurences(schema, "subject", lemmatized_text) self.aggregate_occurences(schema, "object", lemmatized_text) self.aggregate_occurences(schema, "adjective", lemmatized_text) self.aggregate_occurences(schema, "verb", lemmatized_text) print("Finished aggregating occurences.") self.calculate_pmi_helper(schema, corpus_count, "subject_adjective_occ", "subject", "adjective") self.calculate_pmi_helper(schema, corpus_count, "subject_verb_occ", "subject", "verb") self.calculate_pmi_helper(schema, corpus_count, "subject_object_occ", "subject", "object") self.calculate_pmi_helper(schema, corpus_count, "object_verb_occ", "object", "verb") def aggregate_occurences(self, schema, word_table, lemmatized_text): table = self.__postgre_db.get_data_from_table(schema, word_table + "_occ") for item in table: word = item[word_table] split_word = word.split(" ") length = len(split_word) if length > 1: if length == 2: counter = list(bigrams(lemmatized_text)) word_tuple = (split_word[0], split_word[1]) elif length == 3: counter = list(trigrams(lemmatized_text)) word_tuple = (split_word[0], split_word[1], split_word[2]) else: counter = [] count = counter.count(word_tuple) else: word = item[word_table] count = self.aggregate_occurences_help(Counter(lemmatized_text), word) print(word, str(count)) self.__postgre_db.update(schema, word_table + "_occ", "count=" + str(count), "id=" + str(item['id'])) def calculate_pmi_helper(self, schema, corpus_count, co_occurence, word1, word2): co_occ_table = self.__postgre_db.get_data_from_table(schema, co_occurence) for item in co_occ_table: item_id = item['id'] co_occ_freq = float(item['count'] / corpus_count) word1_id = item[word1] word2_id = item[word2] word1_occ = self.__postgre_db.get(schema, word1 + "_occ", "id=" + str(word1_id), "count") word2_occ = self.__postgre_db.get(schema, word2 + "_occ", "id=" + str(word2_id), "count") pmi = log2(co_occ_freq / (float(word1_occ / corpus_count) * float(word2_occ / corpus_count))) self.__postgre_db.update(schema, co_occurence, "pmi=" + str(pmi), "id=" + str(item_id)) def calculate_pmi_use_case2(self, schema): print("Calculating PMI for " + schema) corpus_count = 0 text = [] for item in self.__mongo_db.get(schema, {}): text += word_tokenize(item['text'], language='german') corpus_count += len(word_tokenize(item['text'], language='german')) print(corpus_count) counter = Counter(text) single_pattern_table = self.__postgre_db.get_data_from_table(schema, "bscale_single_pattern") # counting single pattern occurrences for item in single_pattern_table: word = item['single_pattern'] count = counter[word] self.__postgre_db.update(schema, "bscale_single_pattern", "count=" + str(count), "single_pattern=" + add_quotes(word)) # pmi calculation co_occ_table = self.__postgre_db.get_data_from_table(schema, "correlating_pattern") for item in co_occ_table: item_id = item['id'] co_occ_freq = float(item['count'] / corpus_count) word1_id = item['pattern_a'] word2_id = item['pattern_b'] word1_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word1_id), "count") print(word1_occ) word2_occ = self.__postgre_db.get(schema, 
"bscale_single_pattern", "id=" + str(word2_id), "count") print(word2_occ) pmi = log2(co_occ_freq / (float(word1_occ / corpus_count) * float(word2_occ / corpus_count))) print(pmi) self.__postgre_db.update(schema, "correlating_pattern", "pmi=" + str(pmi), "id=" + str(item_id)) def get_results_use_case2(self, schema): print("Colour + Nature") pprint(self.__postgre_db.query( """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 1 AND T.bscale_id = 2 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 2 AND T.bscale_id = 1 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC""")) print("Colour + Location") pprint(self.__postgre_db.query( """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 1 AND T.bscale_id = 3 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 3 AND T.bscale_id = 1 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC""")) print("Colour + Social") pprint(self.__postgre_db.query( """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 1 AND T.bscale_id = 4 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 4 AND T.bscale_id = 1 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC""")) print("Nature + Location") pprint(self.__postgre_db.query( """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 2 AND T.bscale_id = 3 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 3 AND T.bscale_id = 2 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC""")) print("Nature + Social") pprint(self.__postgre_db.query( """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 2 AND T.bscale_id = 4 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 4 AND T.bscale_id = 2 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC""")) print("Location + Social") pprint(self.__postgre_db.query( """SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM """ + schema + """.correlating_pattern C, """ + schema + """.bscale_single_pattern S, """ + schema + """.bscale_single_pattern T WHERE (S.bscale_id = 3 AND T.bscale_id = 4 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR (S.bscale_id = 4 AND T.bscale_id = 3 AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) ORDER BY pmi DESC""")) def check_pattern(self, pattern, token): """Strip token 
and check if the token matches the defined pattern. Parameter: pattern -- the pattern to search for token -- the token to match with the pattern """ split_token = re.split('\W+', token) if split_token[0] == '': split_token = split_token[1] else: split_token = split_token[0] return split_token == pattern def get_result(self, schema): print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.subject_verb_occ SV""")) print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.object_verb_occ SV""")) print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.subject_object_occ SV""")) print(self.__postgre_db.query("""SELECT SUM(SV.count) FROM """ + schema + """.subject_adjective_occ SV""")) pprint(self.__postgre_db.query("""SELECT S.subject, V.verb, SV.pmi FROM """ + schema + """.subject_verb_occ SV, """ + schema + """.subject_occ S, """ + schema + """.verb_occ V WHERE SV.subject = S.id AND SV.verb = V.id ORDER BY subject DESC, pmi DESC""")) pprint(self.__postgre_db.query("""SELECT O.object, V.verb, OV.pmi FROM """ + schema + """.object_verb_occ OV, """ + schema + """.object_occ O, """ + schema + """.verb_occ V WHERE OV.object = O.id AND OV.verb = V.id ORDER BY object DESC, pmi DESC""")) pprint(self.__postgre_db.query("""SELECT O.object, S.subject, SO.pmi FROM """ + schema + """.subject_object_occ SO, """ + schema + """.subject_occ S, """ + schema + """.object_occ O WHERE SO.object = O.id AND SO.subject = S.id ORDER BY subject DESC, pmi DESC""")) pprint(self.__postgre_db.query("""SELECT S.subject, A.adjective, SA.pmi FROM """ + schema + """.subject_adjective_occ SA, """ + schema + """.subject_occ S, """ + schema + """.adjective_occ A WHERE SA.subject = S.id AND SA.adjective = A.id ORDER BY subject DESC, pmi DESC"""))
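# The calculate_pmi_helper and calculate_pmi_use_case2 methods above both apply the same
# pointwise mutual information formula to counts read from PostgreSQL.  A minimal,
# self-contained sketch of that calculation on plain Python numbers follows; the names
# pmi, co_count, word1_count, word2_count and corpus_size are illustrative only and are
# not part of the class above.
from math import log2

def pmi(co_count, word1_count, word2_count, corpus_size):
    """log2( p(a,b) / (p(a) * p(b)) ) computed from raw corpus counts."""
    p_joint = co_count / corpus_size
    p_a = word1_count / corpus_size
    p_b = word2_count / corpus_size
    return log2(p_joint / (p_a * p_b))

# Example: a pair seen 3 times, the single words 10 and 12 times, in a 10,000-token corpus.
print(pmi(3, 10, 12, 10_000))  # > 0, i.e. the pair co-occurs more often than chance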
def predict(**kwargs): import pandas as pd import numpy as np from model_bridging.helpers import (tokenize_pd_code, get_date_diff, add_emp_tenure_to_df, get_bucket_and_key_from_s3_uri, download_model_from_s3) from nltk.corpus import stopwords from nltk.stem.snowball import SnowballStemmer from nltk.tokenize import WhitespaceTokenizer import json import nltk nltk.download("stopwords") def stem_text(text): return " ".join([stemmer.stem(w) for w in w_tokenizer.tokenize(text)]) COLS_REQD = [ "ClaimNumber", "PrimaryDiagnosisCode", "SICCode", "InsuredGender", "InsuredSalaryIndicator", "DOTPrimaryExertionLevel", "CaseSize", "PrimaryDiagnosisDecription", "PrimaryDiagnosisCategory", "InsuredAgeatLoss", "InsuredAnnualizedSalary", "InsuredHireDate", "ReceivedDate", "LossDate", ] CATEGORICAL = [ "pd_code_1", "pd_code_2", "SIC_category", "InsuredGender", "InsuredSalaryIndicator", "DOTPrimaryExertionLevel", "CaseSize", "PrimaryDiagnosisCategory", ] VALID_CLAIM_STATUS_DESC = ["Benefit Case Under Review"] for artifact in kwargs.get("artifact"): if artifact.get("dataName") == "combined_artifacts": model_bucket, model_key = get_bucket_and_key_from_s3_uri( artifact.get("dataValue")) artifacts = download_model_from_s3(model_bucket, model_key) # Unpacking artifacts from the joblib object model = artifacts.get("model") tfidf_model = artifacts.get("tfidf_model") categorical_grouper = artifacts.get("categorical_grouper") train_template = artifacts.get("train_template") input_data = pd.DataFrame([kwargs.get("inputs").get("claim")]) date_cols = [x for x in input_data.columns if "date" in x.lower()] for col in date_cols: input_data.loc[:, col] = pd.to_datetime(input_data[col], errors="coerce") prediction_df = input_data[input_data["ClaimStatusDescription"].isin( VALID_CLAIM_STATUS_DESC)].copy() prediction_df = prediction_df.loc[~( (prediction_df["ClaimStatusDescription"] == "Benefit Case Under Review" ) & (prediction_df["ClaimStatusCode"] == "Closed")), :, ].copy() pred_features = prediction_df[COLS_REQD].copy().drop_duplicates() # tabular data preprocessing part 1 # Extract first 2 characters from SIC code pred_features.loc[:, "SIC_category"] = ( pred_features["SICCode"].astype(str).str[:2]) # sic category feature # split primary diagnosis code into two sub-codes pred_features = tokenize_pd_code(pred_features) # features from PD code # calculate employment tenure feature pred_features = add_emp_tenure_to_df(pred_features) # emp tenure feature # string salary range to number conversion pred_features.loc[:, "InsuredAnnualizedSalary"] = [ (float(op[0]) + float(op[1])) / 2 for op in pred_features["InsuredAnnualizedSalary"].fillna( "0-0").str.split("-") ] # salary feature # pivot operation around Claim Number and Approval date to get sequential info in single row prediction_df["approval_date_rank"] = ( prediction_df.groupby("ClaimNumber")["ApprovalDate"].rank( ascending=True).fillna(-1).astype(int)) # get the values from the earliest snapshot pivot_df = prediction_df.loc[ prediction_df.groupby("ClaimNumber").approval_date_rank.idxmin(), ["ClaimNumber", "BenefitCaseType", "DurationDate"], ].copy() pivot_df.rename({"DurationDate": "first_duration_date"}, axis=1, inplace=True) pivot_df = pivot_df.loc[pivot_df["BenefitCaseType"] == "STD"] # extract features for prediction pred_features = pivot_df.merge(pred_features, how="inner", on="ClaimNumber") # initial prognosis days feature pred_features.loc[:, "initial_prognosis_days"] = get_date_diff( pred_features["LossDate"], pred_features["first_duration_date"], "D") 
pred_features.loc[pred_features["initial_prognosis_days"] <= 0, "initial_prognosis_days"] = np.nan # text preprocessing pipeline for extracting features from # Primary Diagnosis Desc feature # initialize tokenizer, stemmer and stopwords from NLTK w_tokenizer = WhitespaceTokenizer() # lemmatizer = WordNetLemmatizer() stemmer = SnowballStemmer(language="english") stop = stopwords.words("english") # stop word removal and clean up pred_features.loc[:, "PrimaryDiagnosisDecription"] = ( pred_features["PrimaryDiagnosisDecription"].fillna("_na_").apply( lambda x: " ".join( [word for word in x.split(" ") if word not in (stop)]))) pred_features.loc[:, "PrimaryDiagnosisDecription"] = pred_features[ "PrimaryDiagnosisDecription"].str.replace("[^\w\s]", "") # stemming the cleaned text pred_features.loc[:, "pd_desc_stemmed"] = pred_features[ "PrimaryDiagnosisDecription"].apply(stem_text) # feature extraction from tf-idf vectorizer vocab = tfidf_model.get_feature_names() pred_desc_feat = tfidf_model.transform( pred_features.loc[:, "pd_desc_stemmed"]).toarray() pred_desc_feat = pd.DataFrame(pred_desc_feat, columns=vocab) # adding text features to the tabular data x_pred = pd.concat([pred_features, pred_desc_feat], axis=1) # preserving training dataset feature ordering x_pred_sub = x_pred[train_template.columns].copy() x_pred_sub[CATEGORICAL] = x_pred_sub[CATEGORICAL].copy().astype(str) x_pred_sub[CATEGORICAL] = categorical_grouper.transform( x_pred_sub[CATEGORICAL].copy(), CATEGORICAL) pred_features.loc[:, "predicted_probability"] = model.predict_proba( x_pred_sub)[:, 1] pred_features.loc[:, "predicted_bridged_ind"] = model.predict(x_pred_sub) pred_payload = pred_features[[ "ClaimNumber", "predicted_probability", "predicted_bridged_ind" ]] payload_json = json.loads(pred_payload.to_json(orient="records"))[0] claim_number = payload_json["ClaimNumber"] return [{ "inputDataSource": f"{claim_number}:0", "entityId": claim_number, "predictedResult": [{ 'claimNumber': claim_number, 'predictedProbability': payload_json['predicted_probability'], 'predicted_bridged_ind': payload_json['predicted_bridged_ind'] }] }]
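# The predict() pipeline above stems the diagnosis description text with a
# WhitespaceTokenizer + SnowballStemmer and then turns it into TF-IDF features.  Below is
# a minimal sketch of that text branch in isolation; the toy descriptions and the locally
# fitted vectorizer are stand-ins, not the pre-fitted artifacts loaded from S3 above.
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WhitespaceTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = SnowballStemmer(language="english")
w_tokenizer = WhitespaceTokenizer()

def stem_text(text):
    # whitespace-tokenize, stem each token, re-join -- same shape as the helper above
    return " ".join(stemmer.stem(w) for w in w_tokenizer.tokenize(text))

descriptions = pd.Series(["fractured wrist after fall", "lower back pain chronic"])
stemmed = descriptions.apply(stem_text)

# In predict() the vectorizer arrives pre-fitted; here we fit on the toy data just to
# show the transform step.  Recent scikit-learn uses get_feature_names_out(); older
# versions use get_feature_names() as in predict() above.
tfidf = TfidfVectorizer()
features = pd.DataFrame(
    tfidf.fit_transform(stemmed).toarray(),
    columns=tfidf.get_feature_names_out(),
)
print(features.shape)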
from nltk.tokenize import WhitespaceTokenizer

TOKENIZER = WhitespaceTokenizer()


def read(file_name):
    """Read <file_name>.txt (two whitespace-separated columns) and write the
    second column as a comma-separated row to <file_name>.csv."""
    f_in = '%s.txt' % file_name
    f_out = '%s.csv' % file_name
    with open(f_in, 'r') as file_in:
        data = ', '.join(TOKENIZER.tokenize(line)[1] for line in file_in)
    with open(f_out, 'w') as file_out:
        file_out.write(data)


if __name__ == "__main__":
    # Command line arguments
    import argparse
    parser = argparse.ArgumentParser(
        description='Converts a two-column, space-separated file into a csv containing the second column'
    )
    parser.add_argument('file', help='The file to convert')
    args = parser.parse_args()
    read(args.file)
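# Variant of read() above that uses the csv module instead of manually joining strings;
# csv.writer takes care of quoting if a field ever contains a comma.  The file naming
# (<name>.txt in, <name>.csv out) and the single-row layout are carried over from the
# original script; convert_second_column is an illustrative name.
import csv
from nltk.tokenize import WhitespaceTokenizer

def convert_second_column(file_name):
    tokenizer = WhitespaceTokenizer()
    with open('%s.txt' % file_name, 'r') as file_in:
        second_column = [tokenizer.tokenize(line)[1] for line in file_in if line.strip()]
    with open('%s.csv' % file_name, 'w', newline='') as file_out:
        csv.writer(file_out).writerow(second_column)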
import re
import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk import bigrams, trigrams
import math
from collections import Counter
import time
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag.brill import SymmetricProximateTokensTemplate, ProximateTokensTemplate
from nltk.tag.brill import ProximateTagsRule, ProximateWordsRule, FastBrillTaggerTrainer

tokenizer = WhitespaceTokenizer()

templates = [
    SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)),
    SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)),
    SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)),
    SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)),
    SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 1)),
    SymmetricProximateTokensTemplate(ProximateWordsRule, (2, 2)),
    SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 2)),
    SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 3)),
    ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)),
    ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1, 1)),
]

tagged_sentences = []
tokenizer = WhitespaceTokenizer()
with open("datascience_6.txt", "r") as openfile:
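# The Brill-tagger classes imported above (SymmetricProximateTokensTemplate,
# ProximateTagsRule, FastBrillTaggerTrainer, ...) belong to the NLTK 2.x API and were
# removed in NLTK 3.  A rough NLTK 3 equivalent of the same setup is sketched below as an
# assumption about intent, not a drop-in replacement; it assumes the Brown corpus has been
# fetched with nltk.download('brown').
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag.brill import brill24              # predefined template set (tag + word contexts)
from nltk.tag.brill_trainer import BrillTaggerTrainer

train_sents = brown.tagged_sents(categories='news')[:2000]
baseline = UnigramTagger(train_sents)           # baseline tagger the Brill rules correct

trainer = BrillTaggerTrainer(baseline, brill24(), trace=1)
brill_tagger = trainer.train(train_sents, max_rules=100)
print(brill_tagger.tag("The quick brown fox jumps over the lazy dog".split()))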
def splitToWords(text):
    return WhitespaceTokenizer().tokenize(text)
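# Quick usage check for splitToWords: WhitespaceTokenizer splits on runs of whitespace
# only, so punctuation stays attached to the neighbouring word and blank runs collapse.
print(splitToWords("Hello,   world!\nNew line"))
# ['Hello,', 'world!', 'New', 'line']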
#Undirected Graph #Boolean CoOccurrenceCounter with open("datascience545.txt","r") as openfile: Stopwords = nltk.corpus.stopwords.words('english') pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*') WordGramWordGram= Counter() WordGramBiGram = Counter() WordGramTriGram = Counter() BiGramBiGram = Counter() BiGramTriGram = Counter() TriGramTriGram = Counter() tokenizer = WhitespaceTokenizer() for line in openfile: words = line.lower().strip().replace('(',',').replace(')',',') words=re.sub(r'\~|\`|\@|\$|\%|\^|\&|\*|\(|\)|\_|\=|\{|\[|\}|\]|\\|\<|\,|\<|\.|\>|\?|\/|\;|\:|\"|\'', '',words) words = pattern.sub('', words) words=words.split('\r') words = [s.lstrip() for s in words] ReservoirALL={} for word in words: CountWordGrams = Counter() CountBiGrams = Counter() CountTriGrams = Counter() wordsplit= tokenizer.tokenize(word) wordsplit = [s.lstrip() for s in wordsplit] NoDupes = list(set(wordsplit))
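# The loop above is accumulating boolean (once-per-line) co-occurrence counts across
# word-gram, bigram and trigram combinations.  A compact sketch of the core idea for
# plain word pairs, assuming one document per line; pair_counts and the sample lines are
# illustrative only.
from collections import Counter
from itertools import combinations
from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
pair_counts = Counter()
lines = ["data science is applied science", "science needs data"]
for line in lines:
    # boolean counting: each pair is counted at most once per line, hence the set()
    unique_words = sorted(set(tokenizer.tokenize(line.lower())))
    pair_counts.update(combinations(unique_words, 2))
print(pair_counts.most_common(3))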
def tokenize(text):
    tknzr = WhitespaceTokenizer()
    tokens = tknzr.tokenize(text)
    # tokens = nltk.word_tokenize(text)
    return tokens
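# The commented-out nltk.word_tokenize call above is the main alternative to this
# function: it splits punctuation into separate tokens, whereas WhitespaceTokenizer keeps
# it attached.  Illustrative comparison (word_tokenize needs the 'punkt' data from
# nltk.download('punkt')):
import nltk
from nltk.tokenize import WhitespaceTokenizer

s = "Don't split me, please."
print(WhitespaceTokenizer().tokenize(s))  # ["Don't", 'split', 'me,', 'please.']
print(nltk.word_tokenize(s))              # ['Do', "n't", 'split', 'me', ',', 'please', '.']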
    # If text is empty, return None.
    if not text:
        return None
    sentence_tokenizer = _SENTENCE_TOKENIZER_DICT.get(sentence_tokenizer_id)
    return sentence_tokenizer(text)


_word_tokenizer_default = word_tokenize
_word_tokenizer_treebank = TreebankWordTokenizer().tokenize
_word_tokenizer_regex = RegexpTokenizer(pattern=get_word_token_pattern(), gaps=False).tokenize
_word_tokenizer_punkt = WordPunctTokenizer().tokenize
_word_tokenizer_whitespace = WhitespaceTokenizer().tokenize

_WORD_TOKENIZER_DICT = {
    'default': _word_tokenizer_default,
    'treebank': _word_tokenizer_treebank,
    'regex': _word_tokenizer_regex,
    'punkt': _word_tokenizer_punkt,
    'whitespace': _word_tokenizer_whitespace
}


def word_tokenize(text, word_tokenizer_id='default'):
    """ Word-tokenizes a given text, based on the selected tokenizer.

    Args:
        text: A string, corresponding to a sentence.
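# Usage sketch for the dispatcher above, assuming word_tokenize() looks its
# word_tokenizer_id up in _WORD_TOKENIZER_DICT the same way the sentence branch does with
# _SENTENCE_TOKENIZER_DICT; ids outside the dict would need error handling not shown here.
text = "A tiny example sentence, with punctuation."
for tok_id in ('default', 'treebank', 'regex', 'punkt', 'whitespace'):
    print(tok_id, word_tokenize(text, word_tokenizer_id=tok_id))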
class LingPipeParser(object): def __init__(self, config): self.clear() self.config = config def clear(self): self.tok_num = 0 self.byte_idx = 0 self.line_idx = 0 self.word_tokenizer = WhitespaceTokenizer() def set(self, ner_dom): self.clear() ## nltk wants a unicode string, so decode, it and then we will ## re-encode it to carefully recover the byte offsets. We ## must take care not to use any nltk components that insert ## new whitespace, such ## nltk.tokenize.treebank.TreebankTokenizer self.ner_dom = ner_dom self.attributes = [] self.relations = [] def sentences(self): ''' Iterate over <s> XML-like tags and tokenize with nltk ''' for sentence_id, node in enumerate(self.ner_dom.childNodes): ## increment the char index with any text before the <s> ## tag. Crucial assumption here is that the LingPipe XML ## tags are inserted into the original byte array without ## modifying the portions that are not inside the ## LingPipe-added tags themselves. if node.nodeType == node.TEXT_NODE: ## we expect to only see TEXT_NODE instances with whitespace assert only_whitespace.match(node.data), repr(node.data) ## must convert back to utf-8 to have expected byte offsets self.byte_idx += len(node.data.encode('utf-8')) ## count full lines, i.e. only those that end with a \n # 'True' here means keep the trailing newlines for line in node.data.splitlines(True): if line.endswith('\n'): self.line_idx += 1 else: logger.debug('getting tokens for sentence_id=%d' % sentence_id) more_sentence_remains = True while more_sentence_remains: ## always a sentence sent = Sentence() ## this "node" came from for loop above, and it's ## childNodes list might have been popped by a ## previous pass through this while loop tokens = iter( self.tokens( node ) ) while 1: try: tok = tokens.next() sent.tokens.append(tok) #logger.debug('got token: %r %d %d' % (tok.token, tok.mention_id, tok.sentence_pos)) except StopIteration: yield sent more_sentence_remains = False break def _make_token(self, start, end): ''' Instantiates a Token from self._input_string[start:end] ''' ## all thfift strings must be encoded first tok_string = self._input_string[start:end].encode('utf-8') if only_whitespace.match(tok_string): ## drop any tokens with only whitespace return None tok = Token() tok.token = tok_string tok.token_num = self.tok_num if 'BYTES' in self.config['offset_types']: tok.offsets[OffsetType.BYTES] = Offset( type = OffsetType.BYTES, first=self.byte_idx + len(self._input_string[:start].encode('utf-8')), length=len(tok_string), value=self.config['offset_debugging'] and tok_string or None, ) if 'LINES' in self.config['offset_types']: tok.offsets[OffsetType.LINES] = Offset( type = OffsetType.LINES, first=self.line_idx, length=1, value=self.config['offset_debugging'] and tok_string or None, ) self.tok_num += 1 ## keep track of position within a sentence tok.sentence_pos = self.sent_pos self.sent_pos += 1 return tok def tokens(self, sentence_dom): ''' Tokenize all the words and preserve NER labels from ENAMEX tags ''' ## keep track of sentence position, which is reset for each ## sentence, and used above in _make_token self.sent_pos = 0 ## keep track of mention_id, so we can distinguish adjacent ## multi-token mentions within the same coref chain mention_id = 0 while len(sentence_dom.childNodes) > 0: ## shrink the sentence_dom's child nodes. In v0_2_0 this ## was required to cope with HitMaxi16. Now it is just to ## save memory. 
node = sentence_dom.childNodes.pop(0) if node.nodeType == node.TEXT_NODE: ## process portion before an ENAMEX tag for line in node.data.splitlines(True): self._input_string = line for start, end in self.word_tokenizer.span_tokenize(line): tok = self._make_token(start, end) if tok: yield tok if line.endswith('\n'): ## maintain the index to the current line self.line_idx += 1 ## increment index pasat the 'before' portion self.byte_idx += len(line.encode('utf-8')) else: ## process text inside an ENAMEX tag assert node.nodeName == 'ENAMEX', node.nodeName chain_id = node.attributes.get('ID').value entity_type = node.attributes.get('TYPE').value for node in node.childNodes: assert node.nodeType == node.TEXT_NODE, node.nodeType for line in node.data.splitlines(True): self._input_string = line for start, end in self.word_tokenizer.span_tokenize(line): tok = self._make_token(start, end) if tok: if entity_type in _PRONOUNS: tok.mention_type = MentionType.PRO tok.entity_type = _ENTITY_TYPES[entity_type] ## create an attribute attr = Attribute( attribute_type=AttributeType.PER_GENDER, value=str(_PRONOUNS[entity_type]) ) self.attributes.append(attr) else: ## regular entity_type tok.mention_type = MentionType.NAME tok.entity_type = _ENTITY_TYPES[entity_type] tok.equiv_id = int(chain_id) tok.mention_id = mention_id yield tok if line.endswith('\n'): ## maintain the index to the current line self.line_idx += 1 ## increment index pasat the 'before' portion self.byte_idx += len(line.encode('utf-8')) ## increment mention_id within this sentence mention_id += 1
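# _make_token above recovers byte offsets by re-encoding the unicode prefix of each line
# to UTF-8 before measuring its length, so offsets survive multi-byte characters.  A
# stand-alone sketch of that trick with span_tokenize (line and byte_base are
# illustrative values, not part of the parser):
from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
line = "naïve café tokens"   # multi-byte characters make byte offsets differ from char offsets
byte_base = 0                # running byte index of the start of this line in the document
for start, end in tokenizer.span_tokenize(line):
    token = line[start:end]
    first_byte = byte_base + len(line[:start].encode('utf-8'))
    print(token, 'char_offset', start, 'byte_offset', first_byte,
          'byte_length', len(token.encode('utf-8')))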
def __init__(self, *args, **kwargs):
    super(nltk_tokenizer, self).__init__(*args, **kwargs)
    self.sentence_tokenizer = PunktSentenceTokenizer()
    self.word_tokenizer = WhitespaceTokenizer()  # PunktWordTokenizer()
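# The two tokenizers wired up in this __init__ are typically chained: Punkt finds
# sentence boundaries, then the whitespace tokenizer splits each sentence into words.
# Minimal sketch of that chain; PunktSentenceTokenizer() is used untrained with its
# default parameters, the same way as in the class above.
from nltk.tokenize import PunktSentenceTokenizer, WhitespaceTokenizer

sentence_tokenizer = PunktSentenceTokenizer()
word_tokenizer = WhitespaceTokenizer()
text = "First sentence here. Second one follows it."
for sentence in sentence_tokenizer.tokenize(text):
    print(word_tokenizer.tokenize(sentence))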
def word_parser(input_str):
    tokenizer = WhitespaceTokenizer()
    return tokenizer.tokenize(input_str)
""" import re import nltk from nltk.tokenize import WhitespaceTokenizer from nltk.corpus import stopwords from nltk.stem import PorterStemmer, WordNetLemmatizer text = "This is a block of text. I'm writing a piece to explain the usage of nltk packages." text = text.lower() #changes evrything lower case nopunct_text = re.sub('[^a-z0-9]', ' ', text) #remove non alphanumeric characters #tokenize tokens = WhitespaceTokenizer().tokenize(nopunct_text) #remove stopwords stop_words = set(stopwords.words('english')) filtered_tokens = [] for token in tokens: if token not in stop_words: filtered_tokens.append(token) #lemmatize and stem ps = PorterStemmer() lem = WordNetLemmatizer() stemmed_tokens = [] for token in filtered_tokens: