def extract_tokens(row, lemmatize=True, use_tag=True):
    """Split ``row['text']`` on whitespace and optionally lemmatize the words.

    :param row: mapping with a ``'text'`` entry holding the raw string
    :param lemmatize: when false, the plain whitespace tokens are returned
    :param use_tag: when true, POS tags from ``PerceptronTagger`` are mapped
        through ``nltk_to_wordnet`` and passed to the lemmatizer; when false
        the tagger is skipped to speed up computation
    :return: list of tokens
    """
    words = WhitespaceTokenizer().tokenize(row['text'])
    if not lemmatize:
        # simply tokenize based on whitespaces
        return words

    if use_tag:
        # the first letter of the tag selects the WordNet POS (None if unmapped)
        tagged_words = [(word, nltk_to_wordnet.get(tag[0]))
                        for word, tag in PerceptronTagger().tag(words)]
    else:
        tagged_words = [(word, None) for word in words]

    punctuation = re.compile('[().*+,?!\'\";:]*')
    lemmatizer = WordNetLemmatizer()
    token_list = []
    for word, tag in tagged_words:
        # link placeholders and URLs are dropped entirely
        if word == 'urlLink' or 'http:' in word:
            continue
        lowered = word.lower()
        if tag is None:
            lemma = lemmatizer.lemmatize(lowered)
        else:
            lemma = lemmatizer.lemmatize(lowered, tag)
        cleaned = punctuation.sub('', lemma)
        if not cleaned.isdigit():
            token_list.append(cleaned)
    return token_list
def _calculate_sentence_title_score(self, sentence):
    """Score a sentence by the share of its words that also occur in the title.

    Punctuation is removed from both ``self.title`` and ``sentence`` before
    they are split on whitespace. The score is the number of distinct
    sentence words found in the title divided by the number of sentence
    tokens, scaled by ``SENTENCE_SCORE_WEIGHTS['title']``.

    A sentence with no tokens left after punctuation removal scores zero
    instead of raising ``ZeroDivisionError``.
    """
    tokenizer = WhitespaceTokenizer()
    title_words = set(tokenizer.tokenize(self._remove_punctuation(self.title)))
    sentence_words = tokenizer.tokenize(self._remove_punctuation(sentence))

    if not sentence_words:
        # Nothing to compare: no overlap, and the division below is undefined.
        return SENTENCE_SCORE_WEIGHTS['title'] * 0.0

    # Set intersection counts each shared word once, like the original set().
    common_words = title_words.intersection(sentence_words)
    score = float(len(common_words)) / len(sentence_words)
    return SENTENCE_SCORE_WEIGHTS['title'] * score
def es_tokenize(sentence, cxt=False):
    """Whitespace-tokenize every string in ``sentence``.

    With ``cxt`` false, ``sentence`` is iterated as a flat collection of
    strings and a list of token lists is returned. With ``cxt`` true, each
    element is itself iterated as a collection of strings, giving one more
    level of nesting in the result.
    """
    split = WhitespaceTokenizer().tokenize
    if cxt:
        return [[split(text) for text in group] for group in sentence]
    return [split(text) for text in sentence]
def main(tweet):
    """Tokenize and POS-tag ``tweet``, build queries, and scrape the web for them.

    The tagged words are handed to ``formQueries`` and its result to
    ``scrapeWebForEachQuery``, whose return value is passed through.
    """
    tagged_words = pos_tag(WhitespaceTokenizer().tokenize(tweet))
    return scrapeWebForEachQuery(formQueries(tagged_words))
def read_session(lines):
    """Read a transcription and return its words with annotations removed.

    :param lines: an open text file object (e.g. ``_io.TextIOWrapper``); it
        is consumed with ``read()``
    :return: list of whitespace-separated words

    Annotation legend:
    *v: non-Dutch words, *n: new non-existing words, *s: street words,
    *a: incomplete words, *u: distorted words, *x: unclear word,
    xxx: unclear utterances, vvv: non-Dutch sentences,
    ggg: sounds made by the speaker
    """
    text = lines.read()
    # Raw strings keep the regex escapes (\. and \*) out of Python's own
    # string-escape processing, which warns on unknown escapes.
    text = re.sub(r'[0-9]*\.[0-9]*\t', '', text)  # to remove timestamps
    # to remove words with *n, *a, *u, and *x
    text = re.sub(r'[A-Za-z]*\*[anuxANUX]{1}', '', text)
    text = re.sub(r'[A-Za-z]*\*[etV]{1}', '', text)  # unknown notation
    text = re.sub(r'[A-Za-z]*\*op', '', text)  # a mistake?
    # Order matters: the header row must go before '.', '-' and '\n' are
    # rewritten, and '*v' / '*s' markers are stripped while keeping the word.
    text = text.replace('start\tend\ttext\n', '').replace('.', '').replace('-', ' ')\
        .replace('?', '').replace('\n', ' ').replace('xxx', '').replace('ggg', '').replace('vvv', '')\
        .replace('*v', '').replace('*s', '')
    text = re.sub(r'[A-Za-z]*\*', '', text)  # for words with missing notation
    return WhitespaceTokenizer().tokenize(text)
def clean_text(text):
    """Remove punctuation, capitalization, numbers and stop words, then stem.

    The text is lower-cased, passed through
    ``contractions.expandContractions``, rewritten by an ordered list of
    regex substitutions, split on whitespace, filtered against the English
    stop-word list and stemmed with ``PorterStemmer``.

    :param text: input string
    :return: the stemmed words joined by single spaces
    """
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = contractions.expandContractions(text)

    # Applied strictly in this order; all patterns are raw strings so that
    # \W, \s and \d reach the regex engine without invalid-escape warnings.
    substitutions = (
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\'scuse", " excuse "),
        (r'\W', ' '),  # remove punctuation
        (r'\s+', ' '),
        (r'\d+', ' '),  # remove numbers
        # letters repeated 3 or more times in a row are repeated twice
        (r'(.)\1\1+', r'\1\1'),
        (r'(ha)\1\1+', r'haha'),
        (r'(lo)\1\1+', r'lol'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip(' ')

    # stem words
    tokenized_comment = WhitespaceTokenizer().tokenize(text)
    filtered_sentence = [w for w in tokenized_comment if w not in stop_words]
    return " ".join(ps.stem(word) for word in filtered_sentence)
class PreProcess:
    """Tokenize a corpus and build bigram / trigram successor counts.

    After construction, ``bigrams_dict`` maps a token to a ``Counter`` of
    the tokens that follow it, and ``trigrams_dict`` maps a space-joined
    token pair to a ``Counter`` of the tokens that follow the pair.
    """

    def __init__(self, corpus_content):
        self.wst = WhitespaceTokenizer()
        self.all_tokens = []
        self.unique_token = []
        self.bigrams_collection = []
        self.bigrams_dict = defaultdict(list)
        self.trigrams_collection = []
        self.trigrams_dict = defaultdict(list)

        self.tokenize(corpus_content)
        self.trigrams_collection_generator()
        for pairs, table in ((self.bigrams_collection, self.bigrams_dict),
                             (self.trigrams_collection, self.trigrams_dict)):
            self.markov_form(pairs, table)

    def tokenize(self, corpus_content):
        """Split the corpus on whitespace and derive unique tokens and bigrams."""
        tokens = self.wst.tokenize(corpus_content)
        self.all_tokens = tokens
        self.unique_token = list(set(tokens))
        self.bigrams_collection = list(bigrams(tokens))

    def markov_form(self, collection, dictionary):
        """Group values by key, then replace each value list with a Counter."""
        for head, follower in collection:
            dictionary[head].append(follower)
        for head in list(dictionary):
            dictionary[head] = Counter(dictionary[head])

    def trigrams_collection_generator(self):
        """Append ("w1 w2", w3) pairs for every three consecutive tokens."""
        tokens = self.all_tokens
        self.trigrams_collection.extend(
            (" ".join([first, second]), third)
            for first, second, third in zip(tokens, tokens[1:], tokens[2:])
        )
def whitespace_tokenizer(self, review):
    """Tokenize and POS-tag the ``'Content'`` of ``self.data[review]``.

    Words are lower-cased; when ``self.features`` is 1 or 2 each lower-cased
    word is additionally passed through ``process_word``.

    :return: ``(tokens, tags)`` where ``tags`` is ``nltk.pos_tag(tokens)``
    """
    tokenizer = WhitespaceTokenizer()
    apply_processing = self.features in [1, 2]
    tokens = []
    for word in tokenizer.tokenize(self.data[review]['Content']):
        lowered = word.lower()
        tokens.append(process_word(lowered) if apply_processing else lowered)
    return tokens, nltk.pos_tag(tokens)
def withoutverbs(text):
    """Return the words of ``text`` whose POS tag is not an excluded tag.

    The text is split on whitespace and tagged with ``nltk.pos_tag``; words
    tagged VBN, VBG, VBP, VB, VBD or RB are dropped.

    The original compared with ``y not in ('VBN')``, which is a substring
    test against a string rather than membership in a tuple. An exact-match
    set is used here.

    NOTE(review): 'VBZ' is not in the excluded set, so words with that tag
    are kept — confirm this is intended.
    """
    excluded_tags = {'VBN', 'VBG', 'VBP', 'VB', 'VBD', 'RB'}
    tagged = nltk.pos_tag(WhitespaceTokenizer().tokenize(text))
    return [word for (word, tag) in tagged if tag not in excluded_tags]
def lemmatize_text(text):
    """Lower-case ``text`` and lemmatize each whitespace-separated word.

    Every lemma is followed by a single space, so a non-empty result ends
    with a trailing space.
    """
    tokenizer = WhitespaceTokenizer()
    lemmatizer = WordNetLemmatizer()
    pieces = []
    for word in tokenizer.tokenize(text.lower()):
        pieces.append(lemmatizer.lemmatize(word) + ' ')
    return ''.join(pieces)
def parse(self, corpus_filename, key):
    """Read a UTF-8 file and whitespace-tokenize it line by line.

    :param corpus_filename: path of the file to read; must be exactly ``str``
    :param key: key under which the corpus is returned; must be exactly ``str``
    :return: ``{key: corpus}`` where ``corpus`` holds one token list per line
    """
    assert type(corpus_filename) is str, "the filename must be a string"
    assert type(key) is str, "the key must be a string"

    tokenizer = WhitespaceTokenizer()
    corpus = []
    with codecs.open(corpus_filename, encoding="utf8") as handle:
        for line in handle:
            corpus.append(tokenizer.tokenize(line))
    return {key: corpus}
def es_tokenize(sentence):
    """Return one whitespace-token list per string in ``sentence``."""
    split = WhitespaceTokenizer().tokenize
    return [split(text) for text in sentence]
def lemmatize_text(text):
    """Lemmatize each whitespace-separated word of ``text``.

    :param text: string
    :return: the lemmas, converted with ``str`` and joined by single spaces
    """
    lemmatizer = WordNetLemmatizer()
    tokenizer = WhitespaceTokenizer()
    lemmas = []
    for word in tokenizer.tokenize(text):
        lemmas.append(str(lemmatizer.lemmatize(word)))
    return ' '.join(lemmas)
def get_texts_raw(self):
    """Yield each document of the stream as a list of whitespace tokens.

    Parses documents analogously to SimpleCorpus.get_texts(), but tokenizes
    by whitespace only. Each document is converted with
    ``utils.to_unicode`` before splitting.
    """
    tokenizer = WhitespaceTokenizer()
    with self.getstream() as stream:
        for doc in stream:
            yield list(tokenizer.tokenize(utils.to_unicode(doc)))
def tokenizeDoc(self, doc):
    """Return the whitespace-separated tokens (words) of ``doc`` using nltk."""
    return WhitespaceTokenizer().tokenize(doc)
def stemmed_words(text):
    """Stem every whitespace-separated word of ``text`` with the English
    Snowball stemmer and return the stems joined by single spaces."""
    stemmer = SnowballStemmer('english')
    tokenizer = WhitespaceTokenizer()
    return " ".join([stemmer.stem(word) for word in tokenizer.tokenize(text)])
def tokenize_english_document(input_text):
    """Crude tokenizer that wraps English conversation text into subtitles.

    The text is split into blank-line separated blocks, each block into
    sentences, and each sentence into whitespace-separated words. Words are
    packed greedily into lines of at most 38 characters, and lines are
    grouped into subtitles of at most 2 lines. Lines never span sentences.

    :param input_text: the full text
    :return: list of subtitles, each a list of line strings
    """
    split_blocks = BlanklineTokenizer().tokenize
    split_sentences = PunktSentenceTokenizer().tokenize
    split_words = WhitespaceTokenizer().tokenize
    # using the 38 characters in one line rule from ITV subtitle guidelines
    max_line_chars = 38
    max_lines = 2

    end_list = []
    for block in split_blocks(input_text):
        # one block is one speaker
        for sentence in split_sentences(block):
            words = split_words(sentence)
            lines = []
            current_line = ''
            position = 0
            while position < len(words):
                word = words[position]
                candidate = ' '.join([current_line, word]).strip()
                if current_line and len(candidate) > max_line_chars:
                    # The word does not fit: close the line and retry the
                    # same word on a fresh one (position is not advanced).
                    line_full = True
                else:
                    position += 1
                    current_line = candidate
                    # A very long word fills a line entirely by itself.
                    line_full = len(word) >= max_line_chars

                if not line_full:
                    continue
                lines.append(current_line)
                current_line = ''
                if len(lines) >= max_lines:
                    end_list.append(lines)
                    lines = []

            # flush whatever is left of the sentence
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)
    return end_list
def correct_text(text, cdict):
    """Replace sub-tokens of ``text`` that appear as keys in ``cdict``.

    The text is split on whitespace; each chunk is further split with
    ``WordPunctTokenizer`` and every piece found in ``cdict`` is swapped for
    its mapped value. Pieces are re-joined without separators and chunks
    with single spaces.

    The original rebuilt ``set(cdict.keys())`` for every piece; a direct
    dictionary lookup gives the same result without that cost.

    :param text: input string
    :param cdict: mapping from piece to replacement string
    :return: corrected string
    """
    whitespace = WhitespaceTokenizer()
    word_punct = WordPunctTokenizer()
    corrected = []
    for chunk in whitespace.tokenize(text=text):
        pieces = [cdict[piece] if piece in cdict else piece
                  for piece in word_punct.tokenize(chunk)]
        corrected.append("".join(pieces))
    return " ".join(corrected)
def _wrap_words_into_subtitles(words, characters_per_line, lines_per_subtitle):
    """Greedily pack words into lines, and lines into subtitle groups."""
    subtitles = []
    lines = []
    current_line = ''
    pending = list(reversed(words))  # pop() takes words in reading order
    while pending:
        word = pending.pop()
        candidate = ' '.join([current_line, word]).strip()
        if len(candidate) > characters_per_line and len(current_line):
            # Does not fit: put the word back and close the current line.
            pending.append(word)
            flush = True
        else:
            current_line = candidate
            # A very long word closes its line immediately.
            flush = len(word) >= characters_per_line

        if flush:
            lines.append(current_line)
            current_line = ''
            if len(lines) >= lines_per_subtitle:
                subtitles.append(lines)
                lines = []

    if current_line:
        lines.append(current_line)
    if lines:
        subtitles.append(lines)
    return subtitles


def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.

    Blocks are separated by blank lines, blocks are split into sentences,
    and every sentence is wrapped on its own into lines of at most 38
    characters grouped two per subtitle.

    :param input_text: the text to split
    :return: list of subtitles, each a list of line strings
    """
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    end_list = []
    for block in block_tokenizer.tokenize(input_text):
        for sentence in sentence_tokenizer.tokenize(block):
            end_list.extend(_wrap_words_into_subtitles(
                word_tokenizer.tokenize(sentence),
                characters_per_line,
                lines_per_subtitle,
            ))
    return end_list
def split_and_tokenize_reviews(self):
    """Split every cleaned review into sentences and tokenize each one.

    Reviews come from ``self.clean_and_separate_reviews()``; each is
    converted with ``str`` and split on '.', '?' and '!'. The number of
    resulting sentences is printed.

    :return: list of whitespace-token lists, one per sentence
    """
    reviews = self.clean_and_separate_reviews()
    tokenizer = WhitespaceTokenizer()
    train_sentences = [
        tokenizer.tokenize(sentence)
        for review in reviews
        for sentence in re.split("[.?!]", str(review))
    ]
    print("train_sentences length %s" % len(train_sentences))
    return train_sentences
def main(args):
    """Build a sorted vocabulary from the 'pos' and 'neg' training folders.

    Every ``.txt`` file under ``args.train_dir/pos`` and
    ``args.train_dir/neg`` is read, split on whitespace and lower-cased.
    The sorted list of distinct tokens is pickled to ``args.output``.
    Progress is printed every ``REPORT_INTERVAL`` documents and once at the
    end of each folder.

    Files are opened with ``with`` so they are closed even on error, and
    ``print(...)`` with a single argument behaves the same on Python 2 and 3.

    NOTE(review): files are read in 'rb' mode as in the original; under
    Python 3 that yields bytes, which may need decoding before tokenizing.
    """
    tokenizer = WhitespaceTokenizer()
    voc = set()

    folders = (
        ('pos', 'Processed %d positive docs'),
        ('neg', 'Processed %d negative docs'),
    )
    for subdir, fmt in folders:
        doc_dir = os.path.join(args.train_dir, subdir)
        cnt = 0
        for fname in os.listdir(doc_dir):
            if not fname.endswith('.txt'):
                continue
            cnt += 1
            if cnt % REPORT_INTERVAL == 0:
                print(fmt % cnt)
            with open(os.path.join(doc_dir, fname), 'rb') as f:
                voc.update(s.lower() for s in tokenizer.tokenize(f.read()))
        print(fmt % cnt)

    with open(args.output, 'wb') as f:
        pickle.dump(sorted(voc), f)
def skip_grams(sequence_df, feature_size=100, window=4, min_activity_count=0, **kwargs):
    """Vectorizes sequences by blank space and returns skip gram features
    for each activity_ID in sequences

    Parameters
    ----------
    sequence_df : dataframe
        Pandas dataframe (from activities.create_corpus func) containing
        sequences of activity_ID in a 'seq_str' column
    feature_size : integer (default=100)
        Number of dimensions or size of vector to produce for each activity
    window : integer (default=4)
        Size of context window for each activity
    min_activity_count : integer (default=0)
        Minimum number of activity instances to be considered
    **kwargs
        Passed through to ``word2vec.Word2Vec``

    Returns
    -------
    dictionary of activity_IDs and corresponding features from word2vec
    skip grams model

    Raises
    ------
    TypeError
        If feature_size, window or min_activity_count cannot be converted
        with ``int`` (the original only printed a message and carried on
        with the unconverted values).
    """
    assert len(
        sequence_df
    ) > 0 and 'seq_str' in sequence_df.columns, "sequence_df must contain a 'seq_str' column to tokenize."

    try:
        feature_size = int(feature_size)
        window = int(window)
        min_activity_count = int(min_activity_count)
    except TypeError:
        raise TypeError(
            "feature_size, window, and min_activity_count must be integers.")

    tokenizer = WhitespaceTokenizer()
    tokenized_corpus = [
        tokenizer.tokenize(sequence) for sequence in sequence_df['seq_str']
    ]

    # Train model on corpus using skip-gram method (sg=1)
    w2v_model = word2vec.Word2Vec(tokenized_corpus,
                                  size=feature_size,
                                  window=window,
                                  min_count=min_activity_count,
                                  sg=1,
                                  **kwargs)

    # Get unique list of activities
    vocab_activities = list(w2v_model.wv.vocab.keys())

    # Zip activity_ID and features from w2v model
    return dict(zip(vocab_activities, w2v_model.wv[vocab_activities]))
def lemmatize_str(string, wordnet):
    '''
    Lemmatize string using nltk WordNet

    Input: string to lemmatize, and a flag enabling WordNet lemmatization
    Output: string; the lemmas joined by single spaces when ``wordnet`` is
    truthy, otherwise the input string unchanged (the original had no
    defined result for a falsy flag)
    '''
    if not wordnet:
        return string

    w_tokenizer = WhitespaceTokenizer()
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(string)])
def remove_low_freq_noverbs(text_to_modify):
    """Drop words from ``text_to_modify`` that are rare in ``df['Body']``.

    A frequency distribution is built over all word lists in the
    module-level ``df['Body']``; words occurring fewer than 21 times form
    the removal set. ``text_to_modify`` is split on whitespace and words in
    that set are filtered out.

    The removal set is built directly from the distribution's keys; the
    original sorted the rare words and copied them through two further sets,
    none of which affected the result.

    NOTE(review): the distribution is recomputed on every call — consider
    caching it at the call site if this is applied row by row.

    :return: list of remaining words
    """
    all_words = []
    for word_list in df['Body']:
        all_words += word_list
    fdist = FreqDist(all_words)
    rare_words = {word for word in fdist if fdist[word] < 21}

    words = WhitespaceTokenizer().tokenize(text_to_modify)
    return [word for word in words if word not in rare_words]
def stemming_documents(documents):
    """Lower-case and Porter-stem every word of every document.

    :param documents: iterable of strings
    :return: numpy object array with one space-joined stemmed string per
        document
    """
    tokenizer = WhitespaceTokenizer()
    stemmer = PorterStemmer()
    stemmed_documents = [
        ' '.join(stemmer.stem(word.lower())
                 for word in tokenizer.tokenize(document))
        for document in documents
    ]
    return np.array(stemmed_documents, dtype='object')
def lemmatize_series(series, lematize=False, spacy=False):
    """Tokenize, and optionally lemmatize, every entry of a pandas Series.

    :param series: ``pd.Series`` of strings
    :param lematize: when false, entries are only split on whitespace
    :param spacy: with ``lematize`` set, use the module-level ``nlp``
        pipeline's ``lemma_`` values instead of ``WordNetLemmatizer``
    :return: new ``pd.Series`` holding one token list per entry
    :raises ValueError: if ``series`` is not a ``pd.Series``
    """
    if not isinstance(series, pd.Series):
        raise ValueError("Need pandas series as input")

    tokenizer = WhitespaceTokenizer()
    lemmatizer = WordNetLemmatizer()
    rows = []
    for text in list(series):
        if not lematize:
            rows.append(list(tokenizer.tokenize(text)))
        elif spacy == True:  # noqa: E712 - kept as an equality test on purpose
            rows.append([token.lemma_ for token in nlp(text)])
        else:
            rows.append(
                [lemmatizer.lemmatize(word) for word in tokenizer.tokenize(text)])
    return pd.Series(rows)
def unidas(filename):
    """Scrape the Unidas used-car listing and write it to ``<filename>.csv``.

    A Chrome session (images disabled, incognito) loads the listing, reads
    the page count from the paginator, then visits every page. For each
    vehicle the make, model, year, mileage and price are printed and written
    as one CSV row prefixed with "Unidas".

    The output file is managed by ``with`` and the browser is quit in a
    ``finally`` block, so both are released even if scraping fails.

    :param filename: output path without the ``.csv`` extension
    """
    chromeOptions = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    chromeOptions.add_experimental_option("prefs", prefs)
    chromeOptions.add_argument("--incognito")
    browser = webdriver.Chrome(chrome_options=chromeOptions)

    unidasURL = "https://www.seminovosunidas.com.br/veiculos"
    unidasSection = "/page:"
    tknzr = WhitespaceTokenizer()

    try:
        browser.get(unidasURL)
        page_soup = soup(browser.page_source, "html5lib")

        # The fifth paginator entry is rendered to a string and the text
        # between '">' and '</' is taken as the number of pages.
        numberOfPages = str(page_soup.body.find("ul", {"class": "list-unstyled list-inline header-paginator pull-right"}).findAll("li")[4].find("a"))
        numberOfPages = numberOfPages[numberOfPages.find('">')+2:numberOfPages.find("</")]
        print(numberOfPages)

        filename = filename + ".csv"
        with open(filename, "w") as f:
            for i in range(1, int(numberOfPages)+1, 1):
                url = unidasURL + unidasSection + str(i)
                browser.get(url)
                page_soup = soup(browser.page_source, "html5lib")
                containers = page_soup.body.find("div", {"class": "container busca-resultados"}).find("div", {"class": "resultados"}).ul.findAll("li")

                for car in containers:
                    # All fields live under the same column; look it up once.
                    info = car.find("article").find('div').find("div", {"class": "col-sm-6 col-md-9"})
                    make_model = info.find("span", {"class": "makeModel"}).findAll("span")
                    fabricante = make_model[0].text
                    modelo = make_model[1].text
                    ano = info.find("span", {"class": "description"}).text
                    km = info.find("span", {"class": "details"}).text
                    preco = info.find("span", {"class": "valor"}).text

                    # year: last whitespace token, minus a closing parenthesis
                    ano = tknzr.tokenize(ano)
                    ano = ano[len(ano)-1].replace(")", "")
                    # mileage: text after "Km: " up to the first comma
                    km = km[km.find("Km: ") + 4:]
                    km = km[:km.find(",")]
                    # price: drop the last three characters
                    preco = preco[:len(preco)-3]

                    if fabricante == 'MERCEDES':
                        fabricante = "MERCEDES-BENZ"

                    print("Fabricante: " + fabricante)
                    print("Modelo: " + modelo)
                    print("Ano: " + ano)
                    print("Quilometragem: " + km)
                    print("Preço: " + preco)
                    print("\n")

                    f.write("Unidas," + fabricante + "," + modelo.replace(" ","").upper() + "," + ano + "," + km.replace(".", "") + "," + preco.replace(".", "") + "\n")
    finally:
        browser.quit()
def get_words(document):
    '''
    Return a frequency distribution of the lemmatized words in document.

    The document is lower-cased, HTML entities and non-word characters are
    replaced by spaces, and runs of spaces are collapsed. Words that are in
    STOPWORDS or shorter than three characters are dropped before
    lemmatizing.

    Entities are removed BEFORE non-word characters. The original stripped
    non-word characters first, which deleted the '&' and ';' delimiters so
    the entity pattern could never match and entity names leaked through as
    words.

    Returns a FreqDist mapping each lemma to its count.
    '''
    non_word = re.compile(r'\W')             # match non-alphanumeric
    html_entity = re.compile(r'&(#)*(\w)*;')  # match html entities
    space_run = re.compile(r'( ){2,}')       # match 2 or more spaces
    lemmatizer = WordNetLemmatizer()
    tokenizer = WhitespaceTokenizer()

    # lowercase document, remove html entities, then punctuation
    cleaned = html_entity.sub(' ', document.lower())
    cleaned = non_word.sub(' ', cleaned)
    cleaned = space_run.sub(' ', cleaned)

    words = [
        lemmatizer.lemmatize(word) for word in tokenizer.tokenize(cleaned)
        if word not in STOPWORDS and len(word) > 2
    ]
    return FreqDist(words)
def preprocess_article_content(text_df):
    """Clean the 'content' column of ``text_df`` for subsequent LDA.

    text_df is a data frame from a SQL query; column 'content' contains the
    text content of each article. Each article is decoded with
    'unicode-escape', lower-cased and split on whitespace. Punctuation is
    stripped from every token; stop words, empty tokens, tokens of 20 or
    more characters and tokens starting or ending with a digit are dropped;
    the rest are lemmatized. Articles with 100 or fewer cleaned tokens are
    discarded as likely noise (error messages etc).

    Returns ``(article_list, kept_rows)``: the cleaned articles as
    space-joined strings and the row positions they came from.
    """
    print('preprocessing article text...')
    # punctuation to remove
    punc = set('''`~!@#$%^&*()-_=+\\|]}[{;:'",<.>/?''')
    tokenizer = WhitespaceTokenizer()
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    article_list = []
    kept_rows = []
    for row, article in enumerate(text_df['content']):
        decoded = article.decode('unicode-escape', 'ignore').lower()
        cleaned_tokens = []
        for raw_token in tokenizer.tokenize(decoded):
            token = ''.join(ch for ch in raw_token if ch not in punc)
            if token in stop_words:
                continue
            if not 0 < len(token) < 20:
                continue
            if token[0].isdigit() or token[-1].isdigit():
                continue
            cleaned_tokens.append(lemmatizer.lemmatize(token))

        # join cleaned tokens into a string for subsequent LDA,
        # filtering out content that is likely noise
        if len(cleaned_tokens) > 100:
            article_list.append(' '.join(cleaned_tokens))
            kept_rows.append(row)

    print('preprocessed content for %d articles' % len(article_list))
    return article_list, kept_rows
def CleanAndTokenize(text):
    """Normalize ``text`` and return its whitespace-separated tokens.

    Steps, in order: replace URLs with the token "URLURLURL", strip HTML
    tags, lower-case, delete line breaks, expand contractions with
    ``NormalizeContraction``, replace all punctuation except '$' and '%'
    with spaces, and split on whitespace.

    The HTML parser is named explicitly ("html.parser") so parsing does not
    depend on which optional parsers are installed, and the double-space
    condensing step now replaces two spaces with one, as its comment
    intends.

    NOTE(review): line breaks are deleted rather than replaced by a space,
    so words on adjacent lines are joined — confirm this is intended.
    """
    # Strip URLs and replace with token "URLURLURL"
    r = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    text = re.sub(r, " URLURLURL", text)

    # Strip html tags
    soup = BeautifulSoup(text, "html.parser")
    for tag in soup.findAll(True):
        tag.replaceWithChildren()
    text = soup.get_text()

    # Normalize everything to lower case
    text = text.lower()

    # Strip line breaks and endings \r \n
    r = re.compile(r"[\r\n]+")
    text = re.sub(r, "", text)

    # Normalize contractions
    # e.g. can't => can not, it's => it is, he'll => he will
    text = NormalizeContraction(text)

    # Strip punctuation (except for a few)
    punctuations = string.punctuation  # !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    excluded_punctuations = ["$", "%"]
    for p in punctuations:
        if p not in excluded_punctuations:
            text = text.replace(p, " ")

    # Condense double spaces
    text = text.replace("  ", " ")

    # Tokenize the text
    tokenizer = WhitespaceTokenizer()
    text_tokens = tokenizer.tokenize(text)
    return text_tokens
def preprocess_article_content(text_df):
    """Turn the articles in ``text_df['content']`` into cleaned strings.

    text_df is a data frame from a SQL query whose 'content' column holds
    the text of each article. Tokens are produced by decoding with
    'unicode-escape', lower-casing and whitespace splitting; punctuation
    characters are removed from each token. A token is kept (lemmatized)
    only if it is not a stop word, has between 1 and 19 characters, and
    neither starts nor ends with a digit. Only articles with more than 100
    kept tokens are returned.

    Returns a pair ``(article_list, kept_rows)`` of the space-joined cleaned
    articles and their row positions in the column.
    """
    print('preprocessing article text...')
    # define punctuation to remove
    punc = set('''`~!@#$%^&*()-_=+\\|]}[{;:'",<.>/?''')
    tokenizer = WhitespaceTokenizer()
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def usable(token):
        # same tests, same order, as the original nested ifs
        return (token not in stop_words
                and len(token) > 0 and len(token) < 20
                and not token[0].isdigit() and not token[-1].isdigit())

    article_list, kept_rows = [], []
    for row, article in enumerate(text_df['content']):
        tokens = tokenizer.tokenize(
            article.decode('unicode-escape', 'ignore').lower())
        stripped = (''.join(ch for ch in token if ch not in punc)
                    for token in tokens)
        cleaned_tokens = [lemmatizer.lemmatize(token)
                          for token in stripped if usable(token)]
        # short results are likely noise (error messages etc)
        if len(cleaned_tokens) > 100:
            article_list.append(' '.join(cleaned_tokens))
            kept_rows.append(row)

    print('preprocessed content for %d articles' % len(article_list))
    return article_list, kept_rows
def extract(self, corpus):
    """Return a frequency distribution of normalized words in ``corpus``.

    Each sentence in ``corpus`` is split on whitespace. Words starting with
    'http://' are skipped; the rest are lower-cased and lemmatized, and
    kept unless they are an English stop word, 'rt' or '&'.

    The exclusion list is held in a set so each membership test is a hash
    lookup rather than a scan of the whole stop-word list.

    :param corpus: iterable of sentence strings
    :return: ``nltk.FreqDist`` over the normalized words
    """
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords
    from nltk.tokenize import WhitespaceTokenizer

    exclude_words = set(stopwords.words('english'))
    exclude_words.update(('rt', '&'))

    tok = WhitespaceTokenizer()
    lem = WordNetLemmatizer()

    norm_words = []
    for sent in corpus:
        for word in tok.tokenize(sent):
            if word.startswith('http://'):
                continue
            nword = lem.lemmatize(word.lower())
            if nword not in exclude_words:
                norm_words.append(nword)
    return nltk.FreqDist(norm_words)
def CleanAndTokenize(text):
    """Normalize ``text`` and return its whitespace-separated tokens.

    Steps, in order: replace URLs with the token "URLURLURL", strip HTML
    tags (parsed with "html.parser"), lower-case, delete line breaks,
    expand contractions with ``NormalizeContraction``, replace all
    punctuation except '$' and '%' with spaces, and split on whitespace.

    The double-space condensing step now replaces two spaces with one, as
    its comment intends; as written before it replaced a single space with
    itself.

    NOTE(review): line breaks are deleted rather than replaced by a space,
    so words on adjacent lines are joined — confirm this is intended.
    """
    # Strip URLs and replace with token "URLURLURL"
    r = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
    text = re.sub(r, " URLURLURL", text)

    # Strip html tags
    soup = BeautifulSoup(text, "html.parser")
    for tag in soup.findAll(True):
        tag.replaceWithChildren()
    text = soup.get_text()

    # Normalize everything to lower case
    text = text.lower()

    # Strip line breaks and endings \r \n
    r = re.compile(r"[\r\n]+")
    text = re.sub(r, "", text)

    # Normalize contractions
    # e.g. can't => can not, it's => it is, he'll => he will
    text = NormalizeContraction(text)

    # Strip punctuation (except for a few)
    punctuations = string.punctuation  # !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    excluded_punctuations = ["$", "%"]
    for p in punctuations:
        if p not in excluded_punctuations:
            text = text.replace(p, " ")

    # Condense double spaces
    text = text.replace("  ", " ")

    # Tokenize the text
    tokenizer = WhitespaceTokenizer()
    text_tokens = tokenizer.tokenize(text)
    return text_tokens
def build_word_frequency(filepath: Path, language: str):
    """ Parse the passed in text file (likely from Open Subtitles) into a
        word frequency list

        Args:
            filepath (Path): text file to read, one line at a time
            language (str): not used by the code in this function
        Returns:
            Counter: The word frequency as parsed from the file
        Raises:
            ImportError: if NLTK is not installed
        Note:
            This only removes words that are proper nouns (attempts to...)
            and anything that starts or stops with something that is not in
            the alphabet. Nothing is written to disk here; the Counter is
            only returned.
    """
    try:
        from nltk.tag import pos_tag
        from nltk.tokenize import WhitespaceTokenizer
    except ImportError as ex:
        # Exceptions have no ``.message`` attribute on Python 3; format the
        # exception itself and keep it as the cause.
        raise ImportError("To build a dictioary from scratch, NLTK is required!\n{}".format(ex)) from ex

    word_frequency = Counter()
    tok = WhitespaceTokenizer()

    with open(filepath, mode="r") as fobj:
        for line in tqdm.tqdm(fobj):
            # tokenize into parts
            parts = tok.tokenize(line)

            # Attempt to remove proper nouns (tag "NNP") and things that
            # have leading or trailing non-alphabetic characters.
            words = [
                token.lower()
                for token, tag in pos_tag(parts)
                if token and not tag == "NNP" and token[0].isalpha() and token[-1].isalpha()
            ]

            if words:
                word_frequency.update(words)

    return word_frequency
def tokenize(sent, tokenizer_type):
    """Tokenize ``sent`` with the tokenizer selected by ``tokenizer_type``.

    :param sent: the string to tokenize
    :param tokenizer_type: a pair where ``[0]`` names the tokenizer
        ('split_on_custom', 'whitespace', 'wordpunkt', 'regexp' or
        'treebank') and ``[1]`` is the separator / regular expression, or ''
    :return: list of tokens; the string 'Tokenizer not implemented' for an
        unknown name; ``None`` if the tokenizer raises ``ValueError``

    The 'split_on_custom' branch previously compared the whole pair with a
    string and referenced an undefined name ``sents``; it now splits
    ``sent`` on ``tokenizer_type[1]``.
    """
    kind = tokenizer_type[0]

    # split on custom is the only non-nltk tokenizer
    if kind == 'split_on_custom':
        return sent.split(tokenizer_type[1])

    if kind == 'whitespace':
        tokenizer = WhitespaceTokenizer()
    elif kind == 'wordpunkt':
        tokenizer = WordPunctTokenizer()
    elif kind == 'regexp':
        tokenizer = RegexpTokenizer(tokenizer_type[1])
    elif kind == 'treebank':
        tokenizer = TreebankWordTokenizer()
    else:
        return 'Tokenizer not implemented'

    try:
        return tokenizer.tokenize(sent)
    except ValueError:
        # kept from the original: a ValueError from the tokenizer is
        # swallowed and the caller receives None
        return None
def instanciate_dict(message):
    """Count repeated 1-, 2- and 3-grams of ``message``.

    The message is split on whitespace. For each n in 1..3 the n-grams that
    occur at least twice are kept and ordered by descending count.

    :return: dict mapping n (1, 2, 3) to a dict of n-gram tuple -> count
    """
    tokens = WhitespaceTokenizer().tokenize(message)
    sorted_dict = {}
    for size in (1, 2, 3):
        counts = FreqDist(ngrams(tokens, size))
        repeated = {gram: count for gram, count in counts.items() if count >= 2}
        ranked = sorted(repeated.items(), key=operator.itemgetter(1), reverse=True)
        sorted_dict[size] = dict(ranked)
    return sorted_dict
def buildVocab(self):
    """Build ``self.vocab`` from the most frequent tokens of ``self.corpus``.

    Every document is split on whitespace into ``self.tokenizedCorpus``;
    ``self.fullVocab`` is the set of all tokens and ``self.vocabCounts``
    their counts. ``self.vocab`` holds the ``self.vocabSize`` most frequent
    tokens, most frequent first. Progress is printed along the way.
    """
    self.vocabSize = int(self.vocabSize)
    print ("Building vocab from frequencies")

    # get tokenized corpus
    self.tokenizedCorpus = []
    self.vocabSet = set()
    tokenizer = WhitespaceTokenizer()
    for doc in self.corpus:
        self.tokenizedCorpus.extend(tokenizer.tokenize(doc))
    print (" Tokenized corpus = ", len(self.tokenizedCorpus))

    # vocab for entire corpus
    self.fullVocab = set(self.tokenizedCorpus)
    print (" Full vocab = ", len(self.fullVocab))

    # single pass over the corpus to count each word
    counts = {}
    for word in self.tokenizedCorpus:
        counts[word] = counts.get(word, 0) + 1
    self.vocabCounts = counts

    # most frequent first, then keep the first vocabSize words
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    self.vocab = [word for word, _ in ranked[0:self.vocabSize]]
    print (" vocab = ", self.vocab)
def bag_of_words(voc, doc, handle_negation=False, handle_bigrams=False):
    """ Generate bag of words according to dictionary.

    No sanity check is done on the dictionary. Please make each word in the
    dictionary unique and sorted.

    :param voc: list of words
    :param doc: string
    :param handle_negation: when true, a word directly following a negative
        word is stored with a negative sign
    :param handle_bigrams: when true, a word directly following a degree
        word has its value doubled
    :return: int8 feature vector with the same size as the dictionary.
        0 as not appearing. 1 as appearing positive. -1 as appearing
        negative (values are doubled for enhanced words).

    The visible original never returned the vector and used the
    Python-2-only ``except ValueError, e`` syntax; both are fixed here.
    """
    tokenizer = WhitespaceTokenizer()
    tokens = tokenizer.tokenize(doc)
    fv = np.zeros_like(voc, np.int8)

    is_previous_negative = False
    is_previous_enhanced = False
    for token in tokens:
        word = token.lower()
        if is_skip_word(word):
            continue
        # Negative / degree words only set a flag for the next real word.
        if is_negative(word):
            is_previous_negative = True
            continue
        if is_degree(word):
            is_previous_enhanced = True
            continue

        try:
            idx = voc.index(word)
        except ValueError:
            # word is not in the dictionary: nothing to record
            pass
        else:
            value = 1
            if handle_negation and is_previous_negative:
                value = -value
            if handle_bigrams and is_previous_enhanced:
                value *= 2
            fv[idx] = value

        # Flags apply to the immediately following word only.
        is_previous_negative = False
        is_previous_enhanced = False

    return fv
def tokenize(self):
    '''
    tokenize, filter numbers and remove links and save in self.to_write

    Each text in self.texts is lower-cased and split on whitespace;
    tokens found in the module-level ``contractions`` mapping are expanded
    and tokens containing 'http' are blanked. The result is re-joined and
    split again on word characters; tokens that are not alphanumeric are
    blanked. self.unique_words, self.vocab_size and self.words are updated
    for every token, and the re-joined text is appended to self.to_write.

    :return: list with one token list per text
    '''
    print("Tokenizing")
    word_tokenizer = RegexpTokenizer(r'\w+')
    space_tokenizer = WhitespaceTokenizer()
    tokens = []
    for text in self.texts:
        # white space tokenize
        pieces = space_tokenizer.tokenize(text.lower())
        for pos, piece in enumerate(pieces):
            # extending contractions
            if piece in contractions.keys():
                pieces[pos] = contractions[str(piece)]
            # removing links
            if re.search('http', pieces[pos]):
                pieces[pos] = ''

        # regex tokenizing
        words = word_tokenizer.tokenize(" ".join(pieces))
        for pos, word in enumerate(words):
            if not word.isalnum():
                word = ''
                words[pos] = word
            if word not in self.unique_words:
                self.vocab_size += 1
                self.unique_words.append(word)
            self.words.append(word)

        tokens.append(words)
        self.to_write.append(" ".join(words))
    return tokens
def CleanAndTokenize(text):
    """Normalize ``text`` and return its whitespace-separated tokens.

    Steps, in order: replace URLs with the token "URLURLURL", strip HTML
    tags, lower-case, delete line breaks, map curly quotes to straight ones
    and drop ellipsis / em-dash characters, expand a fixed list of English
    contractions, replace all punctuation except '$' and '%' with spaces,
    and split on whitespace.

    Compared with the original: the HTML parser is named explicitly, the
    contraction rules are a single ordered table (the exact-duplicate
    "can't" and "i'm" rules are dropped; they could not match a second
    time), and the double-space condensing step replaces two spaces with
    one as its comment intends.

    NOTE(review): line breaks are deleted rather than replaced by a space,
    so words on adjacent lines are joined — confirm this is intended.
    """
    # Strip URLs and replace with token "URLURLURL"
    r = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
    text = re.sub(r, " URLURLURL", text)

    # Strip html tags
    soup = BeautifulSoup(text, "html.parser")
    for tag in soup.findAll(True):
        tag.replaceWithChildren()
    text = soup.get_text()

    # Normalize everything to lower case
    text = text.lower()

    # Strip line breaks and endings \r \n
    r = re.compile(r"[\r\n]+")
    text = re.sub(r, "", text)

    table = {
        ord(u'\u2018'): u"'",
        ord(u'\u2019'): u"'",
        ord(u'\u201C'): u'"',
        ord(u'\u201d'): u'"',
        ord(u'\u2026'): u'',
        ord(u'\u2014'): u'',  # get rid of em dashes
    }
    text = text.translate(table)

    # Normalize contractions
    # e.g. can't => can not, it's => it is, he'll => he will
    # Applied in this order; plain substring replacement, so "he's" also
    # rewrites the tail of "she's".
    contraction_rules = (
        ("can't", "can not"),
        ("couldn't", "could not"),
        ("don't", "do not"),
        ("didn't", "did not"),
        ("doesn't", "does not"),
        ("shouldn't", "should not"),
        ("haven't", "have not"),
        ("aren't", "are not"),
        ("weren't", "were not"),
        ("wouldn't", "would not"),
        ("hasn't", "has not"),
        ("hadn't", "had not"),
        ("won't", "will not"),
        ("wasn't", "was not"),
        ("isn't", "is not"),
        ("ain't", "is not"),
        ("it's", "it is"),
        ("i'm", "i am"),
        ("i've", "i have"),
        ("i'll", "i will"),
        ("i'd", "i would"),
        ("we've", "we have"),
        ("we'll", "we will"),
        ("we'd", "we would"),
        ("we're", "we are"),
        ("you've", "you have"),
        ("you'll", "you will"),
        ("you'd", "you would"),
        ("you're", "you are"),
        ("he'll", "he will"),
        ("he'd", "he would"),
        ("he's", "he has"),
        ("she'll", "she will"),
        ("she'd", "she would"),
        ("she's", "she has"),
        ("they've", "they have"),
        ("they'll", "they will"),
        ("they'd", "they would"),
        ("they're", "they are"),
        ("that'll", "that will"),
        ("that's", "that is"),
        ("there's", "there is"),
    )
    for contraction, expansion in contraction_rules:
        text = text.replace(contraction, expansion)

    # Strip punctuation (except for a few)
    punctuations = string.punctuation  # !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    excluded_punctuations = ["$", "%"]
    for p in punctuations:
        if p not in excluded_punctuations:
            text = text.replace(p, " ")

    # Condense double spaces
    text = text.replace("  ", " ")

    # Tokenize the text
    # NOTE: Using a simple tokenizer based on spaces ...
    # Could also try a more sophisticated tokenizer if abbreviations /
    # contractions should be conserved
    tokenizer = WhitespaceTokenizer()
    text_tokens = tokenizer.tokenize(text)
    return text_tokens
# For every .txt file in corpus/ that is not listed in skipOver: read it,
# compute paragraph statistics, and build a POS-tagged word list in `text`.
for f in listdir('corpus/'):
    if f.endswith(".txt") and f not in skipOver:
        fileName = f
        # `with` closes the file even if read() raises
        with open('corpus/' + f) as F:
            text = F.read()

        # NOTE(review): octdigits only covers 0-7, so words starting with
        # 8 or 9 are filtered out below — confirm `digits` was not intended.
        alphanum = letters + octdigits

        # paragraphs are separated by blank lines; the last one is dropped
        paragraphs = [s for s in text.split("\n\n") if s != ""][:-1]
        numParagraphs = len(paragraphs)

        # average paragraph size
        wst = WhitespaceTokenizer()
        paraWordCounts = [len(wst.tokenize(p)) for p in paragraphs]
        # the approximate number of words in the document
        numWords = sum(paraWordCounts)
        # the average number of words per paragraph
        avgParagraphLen = mean(paraWordCounts)

        # rejoin the paragraphs
        text = ' '.join(paragraphs)
        # part of speech word list for the text
        text = [word for subl in [pos_tag(wt(s)) for s in st(text)] for word in subl]
        # remove symbols from list by checking the first character of the word
        text = [word for word in text if word[0][0] in alphanum]
from nltk.tokenize import WhitespaceTokenizer

TOKENIZER = WhitespaceTokenizer()


def read(file_name):
    """Write the second column of '<file_name>.txt' to '<file_name>.csv'.

    Every input line is split on whitespace; the second token of each line
    is collected and the values are written to the output file as a single
    ', '-separated record. Lines with fewer than two columns (e.g. blank
    lines) are skipped instead of raising IndexError.

    Parameters:
    file_name -- path of the input file without its '.txt' extension

    I/O errors from opening, reading or writing propagate to the caller.
    """
    f_in = '%s.txt' % file_name
    f_out = '%s.csv' % file_name

    # Context managers make sure both files are closed, even on error.
    with open(f_in, 'r') as file_in:
        rows = [TOKENIZER.tokenize(line) for line in file_in]

    data = ', '.join(tokens[1] for tokens in rows if len(tokens) > 1)

    # Text mode: the payload is a str, which a binary-mode file rejects on
    # Python 3.
    with open(f_out, 'w') as file_out:
        file_out.write(data)

#read()


if __name__ == "__main__":
    # Command line arguments
    import argparse
    parser = argparse.ArgumentParser(
        description='Converts a space two column space separted file into csv containing second column'
    )
    parser.add_argument('file', help='The file to convert')
    args = parser.parse_args()
def tokenize_text(text):
    """Split ``text`` on whitespace and return the resulting list of tokens."""
    return WhitespaceTokenizer().tokenize(text)
class Prototype:
    """Prototype system that searches for RDF pattern (aka Q-Calculus pattern) to find textsnippets.

    Texts are read from a Mongo DB connection; found snippets, their offsets
    and all derived statistics are written through a PostGre DB connection.

    NOTE(review): every condition and query handed to the PostGre wrapper is
    built by string concatenation (schema names, patterns, snippets). Confirm
    that the wrapper escapes these values or that they never come from
    untrusted input.
    """

    # Result record for one found snippet: the snippet text plus the inclusive
    # character offsets of the pattern inside it. Built once for the class
    # instead of once per found snippet.
    __SentObj = namedtuple('Sentence_Object', ['snippet', 'offset_start', 'offset_end'])

    def __init__(self, mongo_db, postgre_db, sentence_mode=True, punctuation_mode=False, window_size=0):
        """Initialize a prototype with a specified configurations.

        Parameters:
        mongo_db -- Mongo DB connection
        postgre_db -- PostGre DB connection
        sentence_mode -- whether or not to use sentence window mode (default True)
        punctuation_mode -- whether sentence windows are built by splitting on
                            punctuation marks (default False)
        window_size -- the size of the sentence or word window (default 0)
        """
        self.__mongo_db = mongo_db
        self.__postgre_db = postgre_db
        self.__sentence_mode = sentence_mode
        self.___punctuation_mode = punctuation_mode
        self.__window_size = window_size
        self.tokenizer = WhitespaceTokenizer()
        self.parser = Parser()

    def exit(self):
        """Close down the prototype."""
        self.__mongo_db.close_connection()
        self.__postgre_db.close_connection()

    def create_new_collection(self, schema_name):
        """Create a new schema with the given name in the PostGre DB."""
        self.__postgre_db.create_schema(schema_name)

    def get_window_size(self):
        """Gets the current window size."""
        return self.__window_size

    def get_sentence_mode(self):
        """Returns True if sentence window mode is activated, else False."""
        return self.__sentence_mode

    def change_window_size(self, size):
        """Change the current window size to a new size.

        Parameters:
        size -- the new size; anything int() accepts

        Raises ValueError if size is not a number or is negative.
        """
        try:
            value = int(size)
        except (TypeError, ValueError):
            # int(None) raises TypeError; report it like any other bad input.
            raise ValueError("Please type in a valid number.")
        if value < 0:
            raise ValueError("Please type in a valid positive number.")
        self.__window_size = value

    def activate_sentence_window_mode(self):
        """Activate sentence window mode."""
        self.__sentence_mode = True

    def activate_word_window_mode(self):
        """De-activate sentence window mode."""
        self.__sentence_mode = False

    def activate_punctuation_mode(self):
        """Activate punctuation mode."""
        self.___punctuation_mode = True

    def deactivate_punctuation_mode(self):
        """De-activate punctuation mode."""
        self.___punctuation_mode = False

    def get_punctuation_mode(self):
        """Returns True if punctuation mode is activated, else False."""
        return self.___punctuation_mode

    def get_word_window(self, pattern, tokens, constraints):
        """Get a word window list with a specific number of words.

        Parameters:
        pattern -- the pattern to search for
        tokens -- the tokens to search in
        constraints -- a constraint tuple list
        """
        split_pattern = pattern.split()
        if len(split_pattern) > 1:
            textsnippets = self.__get_word_window_more_words_help(split_pattern, tokens, constraints)
        else:
            textsnippets = self.__get_word_window_one_word_help(pattern, tokens, constraints)
        return textsnippets

    def __matches_at(self, split_pattern, tokens, start):
        """Return True if every pattern word matches the tokens beginning at start."""
        end = start + len(split_pattern)
        if end > len(tokens):
            # Not enough tokens left for the whole pattern.
            return False
        return all(self.check_pattern(word, token)
                   for word, token in zip(split_pattern, tokens[start:end]))

    @staticmethod
    def __find_offsets(pattern, text):
        """Return the inclusive (start, end) character offsets of pattern in text.

        The pattern is first searched literally. Tokens are matched with their
        punctuation stripped, so a multi-word pattern may be separated by
        punctuation in the text ("red, car"); a second, punctuation tolerant
        search covers that case. Returns None if the pattern cannot be found.
        """
        match = re.search(re.escape(pattern), text)
        if match is None:
            words = pattern.split()
            if len(words) > 1:
                # rest of the matched token, whitespace, leading punctuation
                # of the next token
                gap = r"(?:[^\w\s]\S*)?\s+[^\w\s]*"
                match = re.search(gap.join(re.escape(word) for word in words), text)
        if match is None:
            return None
        start, end = match.span()
        return start, end - 1

    def __append_snippet(self, textsnippets, snippet, pattern):
        """Append a snippet object with the pattern offsets to textsnippets."""
        offsets = self.__find_offsets(pattern, snippet)
        if offsets is None:
            # The pattern text could not be located; there is no offset to store.
            return
        textsnippets.append(self.__SentObj(snippet=snippet, offset_start=offsets[0],
                                           offset_end=offsets[1]))

    def __get_word_window_more_words_help(self, split_pattern, tokens, constraints):
        """Find pattern with more than one word."""
        textsnippets = []
        textlength = len(tokens)
        pattern = " ".join(split_pattern)
        for ind in range(textlength):
            if not self.__matches_at(split_pattern, tokens, ind):
                continue
            last_index = ind + len(split_pattern) - 1
            if constraints is not None:
                self.__check_constraints(constraints, (ind, last_index), ind, split_pattern, None, None,
                                         textsnippets, tokens)
            else:
                self.__get_word_window_help((ind, last_index), textsnippets, textlength, tokens, pattern)
        return textsnippets

    def __get_word_window_one_word_help(self, pattern, tokens, constraints):
        """Find pattern with only one word."""
        textsnippets = []
        textlength = len(tokens)
        for ind, token in enumerate(tokens):
            if self.check_pattern(pattern, token):
                if constraints is not None:
                    self.__check_constraints(constraints, (ind, ind), ind, pattern, None, None, textsnippets,
                                             tokens)
                else:
                    self.__get_word_window_help((ind, ind), textsnippets, textlength, tokens, pattern)
        return textsnippets

    def __get_word_window_help(self, token_pos, textsnippets, textlength, tokens, pattern):
        """Build the word window around token_pos and append it to textsnippets."""
        snippet = self.__get_textsnippets(token_pos[0], token_pos[1], textlength, tokens)
        self.__append_snippet(textsnippets, snippet, pattern)

    def __get_textsnippets(self, indl, indr, textlength, tokens):
        """Join the tokens from window_size words left of indl to window_size words right of indr.

        The window is clipped at the beginning and the end of the text.
        """
        left_border = max(0, indl - self.__window_size)
        right_border = min(textlength, indr + self.__window_size + 1)
        return " ".join(tokens[left_border:right_border])

    def get_sentence_window(self, pattern, sentences, constraints):
        """Get a list with a specific number of sentences. size 0 will return the
        current sentence the pattern is found in. size n will return n sentences left and right
        from the initial sentence.

        Parameters:
        pattern -- the pattern to search for
        sentences -- the sentences to search in
        constraints -- the constraint tuple list
        """
        split_pattern = pattern.split()
        if len(split_pattern) > 1:
            textsnippets = self.__get_sentence_window_more_words(split_pattern, sentences, constraints)
        else:
            textsnippets = self.__get_sentence_window_one_word(pattern, sentences, constraints)
        return textsnippets

    def __get_sentence_window_one_word(self, pattern, sentences, constraints):
        """Get sentence snippets with pattern containing of only one words according to window size."""
        textsnippets = []
        for ind, sent in enumerate(sentences):
            tokens = self.tokenizer.tokenize(sent)
            for i, token in enumerate(tokens):
                if self.check_pattern(pattern, token):
                    if constraints is not None:
                        self.__check_constraints(constraints, (i, i), ind, pattern, sent, sentences,
                                                 textsnippets, tokens)
                    else:
                        self.__get_sentence_window_help(ind, sentences, textsnippets, pattern)
        return textsnippets

    def __check_constraints(self, constraints, token_pos, sent_num, pattern, sent, sentences, textsnippets,
                            tokens):
        """Traverse the given list of constraints and find target words near the keyword. The number of word
        distance is given in the constraint list.
        add_info[0] is the keyword aka pattern.
        add_info[1] is the target_word aka the constraint.
        add_info[2] is the word distance from constraint to the pattern.

        A constraint is only applied if its keyword matches the pattern. The
        target word may occur at any distance between the given one and the
        pattern itself; the first hit adds exactly one snippet.

        Parameters:
        constraints -- the constraint tuple list
        token_pos -- (first, last) token index of the found pattern
        sent_num -- index of the sentence the pattern was found in
        pattern -- the pattern, a string or (for several words) a list of words
        sent -- the sentence the pattern was found in (unused)
        sentences -- all sentences (only needed in sentence mode)
        textsnippets -- the result list that found snippets are appended to
        tokens -- the tokens the pattern was found in
        """
        more_words_flag = token_pos[0] != token_pos[1]
        if isinstance(pattern, str):
            pattern_words = pattern.split()
            pattern_text = pattern
        else:
            pattern_words = list(pattern)
            # The snippet helpers search for the pattern as text, not as a list.
            pattern_text = " ".join(pattern_words)

        for add_info in constraints:
            keyword, target, index = add_info[0], add_info[1], add_info[2]

            # find pattern that matches target word
            if more_words_flag:
                keyword_matches = all(self.check_pattern(word, part)
                                      for word, part in zip(pattern_words, keyword.split()))
            else:
                keyword_matches = self.check_pattern(pattern_text, keyword)
            if not keyword_matches:
                continue

            # set token_pos depending if index is positive or negative
            if more_words_flag and index > 0:
                pos = token_pos[1]
            else:
                pos = token_pos[0]

            # Look at the exact distance first, then walk towards the pattern.
            step = -1 if index > 0 else 1
            target_found = any(
                0 <= pos + offset < len(tokens) and self.check_pattern(target, tokens[pos + offset])
                for offset in range(index, step, step))
            if not target_found:
                continue

            if self.__sentence_mode:
                self.__get_sentence_window_help(sent_num, sentences, textsnippets, pattern_text)
            else:
                self.__get_word_window_help(token_pos, textsnippets, len(tokens), tokens, pattern_text)

    def __get_sentence_window_help(self, ind, sentences, textsnippets, pattern):
        """Build the sentence window around sentence ind and append it to textsnippets."""
        sentence = self.__get_sentences(ind, sentences)
        self.__append_snippet(textsnippets, sentence, pattern)

    def __get_sentence_window_more_words(self, split_pattern, sentences, constraints):
        """Get sentence snippets with pattern containing of more than 2 words according to window size.

        The pattern is searched at every token position of a sentence, not
        only at its beginning; every occurrence adds a snippet.
        """
        textsnippets = []
        pattern = " ".join(split_pattern)
        for ind, sent in enumerate(sentences):
            tokens = self.tokenizer.tokenize(sent)
            for begin_index in range(len(tokens)):
                if not self.__matches_at(split_pattern, tokens, begin_index):
                    continue
                end_index = begin_index + len(split_pattern) - 1
                # search for constraints in sentence
                if constraints is not None:
                    self.__check_constraints(constraints, (begin_index, end_index), ind, split_pattern, sent,
                                             sentences, textsnippets, tokens)
                else:
                    self.__get_sentence_window_help(ind, sentences, textsnippets, pattern)
        return textsnippets

    def __get_sentences(self, ind, sentences):
        """Return sentence ind together with window_size sentences on each side."""
        if self.__window_size == 0:
            return sentences[ind]
        elif self.__window_size > 0:
            left_window_border = max(0, ind - self.__window_size)
            right_window_border = min(len(sentences), ind + self.__window_size + 1)
            return " ".join(sentences[left_window_border:right_window_border])

    def find_text_window(self, schema, text, text_id, constraints=None):
        """Finds text windows with variable size and pushes the found results in the PostGre database.

        Parameters:
        schema -- the schema to read the patterns from and write the results to
        text -- text to search in
        text_id -- id of the text
        constraints -- the constraint tuple list"""
        # this is only a quick and dirty fix: replace weird quotes to basic ones
        for ch in ['›', '‹', '»', '«']:
            if ch in text:
                text = text.replace(ch, '"')

        # Segment the text once; the segmentation is the same for all patterns.
        if self.__sentence_mode:
            if self.___punctuation_mode:
                units = [item for item in re.split('[!?.,;:]', text) if item != '']
            else:
                units = sent_tokenize(text, language='german')
        else:
            units = self.tokenizer.tokenize(text)

        for pattern in self.__postgre_db.get_data_from_table(schema, "single_pattern"):
            if self.__sentence_mode:
                windows_objects = self.get_sentence_window(pattern['single_pattern'], units, constraints)
            else:
                windows_objects = self.get_word_window(pattern['single_pattern'], units, constraints)

            # push found snippets onto database
            if len(windows_objects) > 0:
                single_pattern_id = pattern['id']
                for sent_obj in windows_objects:
                    # push snippets
                    self.__push_snippets(schema, sent_obj.snippet)
                    snippet_id = self.__postgre_db.get_id(schema,"snippets", "snippet=" + add_quotes(
                        replace_special_characters(sent_obj.snippet)))
                    # push relations
                    self.__push_texts_snippets(schema, text_id, snippet_id)
                    self.__push_snippet_offsets(schema, single_pattern_id, snippet_id,
                                                sent_obj.offset_start, sent_obj.offset_end)

    def __push_snippets(self, schema, snippet):
        """Push found snippets onto the snippets table in PostGre DB, if not already in the table."""
        if not self.__postgre_db.is_in_table(schema, "snippets", "snippet=" + add_quotes(
                replace_special_characters(snippet))):
            self.__postgre_db.insert(schema,"snippets", {"snippet": snippet})

    def __push_texts_snippets(self, schema, text_id, snippet_id):
        """Get all saved snippets that occur in a text and push them onto PostGre DB."""
        self.__push_relation(schema, text_id, snippet_id, "text_id", "snippet_id", "texts_snippets")

    def __push_snippet_offsets(self, schema, single_pattern_id, snippet_id, offset_start, offset_end):
        """Push found single_pattern in snippets and their respective offset."""
        condition = "single_pattern_id=" + str(single_pattern_id) + " and snippet_id=" + str(snippet_id)
        if not self.__postgre_db.is_in_table(schema, "snippet_offsets", condition):
            self.__postgre_db.insert(schema, "snippet_offsets", {
                "single_pattern_id": single_pattern_id, "snippet_id": snippet_id, "offsets": [
                    [offset_start, offset_end]]})
        else:
            old_list = self.__postgre_db.get(schema, "snippet_offsets", condition, "offsets")
            old_list.append([offset_start, offset_end])
            pid = self.__postgre_db.get_id(schema, "snippet_offsets", condition)
            self.__postgre_db.update(schema, "snippet_offsets", "offsets=" + add_quotes(replace_brackets(str(
                old_list))), "id=" + str(pid))

    def __push_relation(self, schema, id1, id2, id1_name, id2_name, table):
        """Push a relation onto the PostGre DB. The relation has to have a primary key."""
        # case: No entry about relation is in DB yet
        if not self.__postgre_db.is_in_table(schema, table, id1_name + "=" + str(id1)):
            self.__postgre_db.insert(schema, table, {
                id1_name: id1, id2_name: [id2], "aggregation": 0})
        # case: Entry about single_pattern is in DB
        else:
            old_list = self.__postgre_db.get(schema, table, id1_name + "=" + str(id1), id2_name)
            new_list = list(set(old_list + [id2]))
            self.__postgre_db.update(schema, table, id2_name + "=" + add_quotes(replace_brackets(str(
                new_list))), id1_name + "=" + str(id1))

    def __push_aggregation_lowest_layer(self, schema, aggregation_object, aggregation_name, table, id_name):
        """Push the aggregated snippet numbers onto corresponding the lower layer tables."""
        for aggregation in aggregation_object:
            row_id = aggregation[aggregation_name][0]
            aggregation_value = aggregation[aggregation_name][1]
            self.__postgre_db.update(schema, table, "aggregation=" + str(aggregation_value),
                                     id_name + "=" + str(row_id))

    def __push_aggregation(self, schema, table, sub_table, table_id, sub_table_id):
        """Calculate and push aggregation on the rest layer tables."""
        table_entries = self.__postgre_db.get_data_from_table(schema, table)
        for entry in table_entries:
            aggregation = 0
            entry_id = entry[table_id]
            entries_to_look_up = entry[sub_table_id]
            for look_up in entries_to_look_up:
                query = "SELECT SUM(aggregation) FROM " + schema + "." + sub_table + " WHERE " + sub_table_id + "=" + str(look_up)
                stored_value = self.__postgre_db.query(query)[0]['sum']
                if stored_value is None:
                    stored_value = 0
                aggregation += stored_value
            self.__postgre_db.update(schema, table, "aggregation=" + str(aggregation),
                                     table_id + "=" + str(entry_id))

    def get_snippets(self, schema, constraints):
        """Get snippets for the whole corpus.

        Parameter:
        schema -- the schema (corpus) to process
        constraints -- the constraint tuple list"""
        for text in self.__mongo_db.get(schema, {}):
            self.__postgre_db.insert(schema, "texts", {"title": text['title']})
            self.find_text_window(schema, text['text'], text['id'], constraints)
            print("Finished extracting snippets from chapter " + str(text['id']) + ".")

    def aggregation(self, schema):
        """Calculate aggregation bottom-up and store the interim data onto the database."""
        aggregation_texts_snippets = self.__postgre_db.query("SELECT " + schema + ".aggregate_texts_snippets()")
        aggregation_snippet_offsets = self.__postgre_db.query("SELECT " + schema + ".aggregate_snippet_offsets()")

        # push 2 lowest levels of the hierarchy
        self.__push_aggregation_lowest_layer(schema, aggregation_texts_snippets, 'aggregate_texts_snippets',
                                             "texts_snippets", "text_id")
        self.__push_aggregation_lowest_layer(schema, aggregation_snippet_offsets, 'aggregate_snippet_offsets',
                                             "snippet_offsets", "id")

        # push rest of the hierarchy
        self.__push_aggregation(schema, "pattern_single_pattern", "snippet_offsets", 'pattern_id',
                                'single_pattern_id')
        self.__push_aggregation(schema, "has_object", "pattern_single_pattern", 'bscale_id', 'pattern_id')
        self.__push_aggregation(schema, "has_attribute", "has_object", 'bsort_id', 'bscale_id')

    def __combine_bscale(self, schema, new_bscale, bsort, scale_type, use_union, scales):
        """Create new_bscale from the union or intersection of the patterns of existing scales."""
        pattern_info = self.__add_new_bscale(schema, new_bscale, bsort, scale_type, *scales)
        if pattern_info is not None:
            pattern_sets = [set(item) for item in pattern_info[0]]
            new_bscale_id = pattern_info[1]
            if not pattern_sets:
                # set.union() / set.intersection() need at least one operand.
                combined = set()
            elif use_union:
                combined = set.union(*pattern_sets)
            else:
                combined = set.intersection(*pattern_sets)
            new_pattern_list = list(combined)
            aggregation = 0
            for item in new_pattern_list:
                aggregation += self.__postgre_db.get(schema, "pattern_single_pattern",
                                                     "pattern_id=" + str(item), "aggregation")
            self.__postgre_db.insert(schema, "has_object", {
                "bscale_id": new_bscale_id, "pattern_id": new_pattern_list, "aggregation": aggregation})

    def aggregate_bscale(self, schema, new_bscale, bsort, scale_type, *args):
        """Create a new bscale that holds the union of the patterns of the bscales given in args."""
        self.__combine_bscale(schema, new_bscale, bsort, scale_type, True, args)

    def intersect_bscale(self, schema, new_bscale, bsort, scale_type, *args):
        """Create a new bscale that holds the intersection of the patterns of the bscales given in args."""
        self.__combine_bscale(schema, new_bscale, bsort, scale_type, False, args)

    def __add_new_bscale(self, schema, new_bscale, bsort, scale_type, *args):
        """Register new_bscale for bsort and collect the pattern ids of the bscales named in args.

        Returns a tuple (pattern id lists of the named bscales, id of the new bscale).
        Raises Exception if one of the named bscales does not exist.
        """
        bscale_table = self.__postgre_db.get_data_from_table(schema, "bscale")
        bscale_ids = []
        for scale in args:
            scale_found = False
            for bscale in bscale_table:
                if scale == bscale['bscale']:
                    bscale_ids.append(bscale['id'])
                    scale_found = True
            if not scale_found:
                raise Exception("Chosen Bscale does not exist.")

        if not self.__postgre_db.is_in_table(schema, "bscale", "bscale=" + add_quotes(new_bscale)):
            self.__postgre_db.insert(schema, "bscale", {
                "bscale": new_bscale, "nominal": False, "ordinal": False, "interval": False})
        new_bscale_id = self.__postgre_db.get_id(schema, "bscale", "bscale=" + add_quotes(new_bscale))
        self.__postgre_db.update(schema, "bscale", scale_type + "=" + add_quotes('True'),
                                 "id=" + str(new_bscale_id))

        bsort_id = self.__postgre_db.get_id(schema, "bsort", "bsort=" + add_quotes(bsort))
        if self.__postgre_db.is_in_table(schema, "has_attribute", "bsort_id=" + str(bsort_id)):
            old_list = self.__postgre_db.get(schema, "has_attribute", "bsort_id=" + str(bsort_id), "bscale_id")
            old_list.append(new_bscale_id)
            self.__postgre_db.update(schema, "has_attribute", "bscale_id=" + add_quotes(
                replace_brackets(str(old_list))), "bsort_id=" + str(bsort_id))
        else:
            self.__postgre_db.insert(schema, "has_attribute", {
                "bsort_id": bsort_id, "bscale_id": [new_bscale_id], "aggregation": 0})

        scale_obj = self.__postgre_db.get_data_from_table(schema, "has_object")
        pattern_ids = []
        for scale_id in bscale_ids:
            for item in scale_obj:
                if scale_id == item['bscale_id']:
                    pattern_ids.append(item['pattern_id'])
        return (pattern_ids, new_bscale_id)

    def find_correlating_pattern(self, schema):
        """Count how often two single patterns of different ids occur in the same snippet.

        First fills the table bscale_single_pattern from the bscale hierarchy,
        then counts the pattern pairs per snippet in the table correlating_pattern.
        """
        all_snippets_table = self.__postgre_db.get_data_from_table(schema, "snippets")
        all_snippets = [snippet['snippet'] for snippet in all_snippets_table]
        all_bscales_table = self.__postgre_db.get_data_from_table(schema, "bscale")
        all_bscales = [bscale['id'] for bscale in all_bscales_table]

        for bscale_id in all_bscales:
            pattern_list = self.__postgre_db.get(schema, "has_object", "bscale_id=" + str(bscale_id),
                                                 "pattern_id")
            for pattern_id in pattern_list:
                single_pattern_id_list = self.__postgre_db.get(
                    schema, "pattern_single_pattern", "pattern_id=" + str(pattern_id), "single_pattern_id")
                for single_pattern_id in single_pattern_id_list:
                    single_pattern = self.__postgre_db.get(schema, "single_pattern",
                                                           "id=" + str(single_pattern_id), "single_pattern")
                    self.__postgre_db.insert(schema, "bscale_single_pattern", {
                        "bscale_id": bscale_id, "single_pattern_id": single_pattern_id,
                        "single_pattern": single_pattern , "count": 0})

        for snippet in self.parser.nlp.pipe(all_snippets, batch_size=3000, n_threads=-1):
            correlating_pattern = self.parser.get_correlating_nouns_and_adjectives(snippet)
            for ind, item in enumerate(correlating_pattern):
                if not self.__postgre_db.is_in_table(schema, "bscale_single_pattern",
                                                     "single_pattern=" + add_quotes(item)):
                    continue
                pattern_id = self.__postgre_db.get(schema, "bscale_single_pattern",
                                                   "single_pattern=" + str(add_quotes(item)),
                                                   "single_pattern_id")
                # pair the item with every item that follows it in the snippet
                for next_item in correlating_pattern[ind + 1:]:
                    if not self.__postgre_db.is_in_table(schema, "bscale_single_pattern",
                                                         "single_pattern=" + add_quotes(next_item)):
                        continue
                    pattern_next_item_id = self.__postgre_db.get(
                        schema, "bscale_single_pattern", "single_pattern=" + str(add_quotes(next_item)),
                        "single_pattern_id")
                    if pattern_id == pattern_next_item_id:
                        continue
                    first_combination = "pattern_a=" + str(pattern_id) + " and pattern_b=" + str(
                        pattern_next_item_id)
                    second_combination = "pattern_a=" + str(pattern_next_item_id) + " and pattern_b=" + str(
                        pattern_id)
                    first_combination_in_table = self.__postgre_db.is_in_table(
                        schema, "correlating_pattern", first_combination)
                    second_combination_in_table = self.__postgre_db.is_in_table(
                        schema, "correlating_pattern", second_combination)
                    # update entry if already exists in table correlating_pattern
                    if first_combination_in_table:
                        old_count = self.__postgre_db.get(schema, "correlating_pattern", first_combination,
                                                          "count")
                        new_count = old_count + 1
                        self.__postgre_db.update(schema, "correlating_pattern", "count=" + str(new_count),
                                                 first_combination)
                    elif second_combination_in_table:
                        old_count = self.__postgre_db.get(schema, "correlating_pattern", second_combination,
                                                          "count")
                        new_count = old_count + 1
                        self.__postgre_db.update(schema, "correlating_pattern", "count=" + str(new_count),
                                                 second_combination)
                    else:
                        # create new entry for pattern pair if none exists
                        self.__postgre_db.insert(schema, "correlating_pattern", {
                            "pattern_a": pattern_id, "pattern_b": pattern_next_item_id, "count": 1})

    def find_spo_and_adjectives(self, schema):
        """Parse all snippets and store subject/verb/object and noun/adjective relations of the patterns."""
        all_snippets_table = self.__postgre_db.get_data_from_table(schema, "snippets")
        all_snippets = [snippet['snippet'] for snippet in all_snippets_table]
        for snippet in self.parser.nlp.pipe(all_snippets, batch_size=3000, n_threads=-1):
            spo = self.parser.get_SVO(snippet)
            for item in spo:
                if item is not None:
                    # subject is pattern
                    # NOTE(review): this compares against a single apostrophe, while the
                    # checks below compare against the empty string -- confirm the intent.
                    if item.subject != "'":
                        if self.__postgre_db.is_in_table(schema, "single_pattern",
                                                         "single_pattern=" + add_quotes(item.subject)):
                            self.push_parser_items(schema, item.subject, "subject_occ", "subject")
                            self.push_parser_items(schema, item.verb, "verb_occ", "verb")
                            self.push_parser_item_relationship(
                                schema, item.subject, item.verb, "subject_verb_occ", "subject", "verb")
                            if item.object != '':
                                self.push_parser_items(schema, item.object, "object_occ", "object")
                                self.push_parser_item_relationship(schema, item.subject, item.object,
                                                                   "subject_object_occ", "subject", "object")
                        #object is pattern
                        elif self.__postgre_db.is_in_table(schema, "single_pattern",
                                                           "single_pattern=" + add_quotes(item.object)):
                            self.push_parser_items(schema, item.object, "object_occ", "object")
                            self.push_parser_items(schema, item.verb, "verb_occ", "verb")
                            self.push_parser_item_relationship(schema, item.object, item.verb,
                                                               "object_verb_occ", "object", "verb")
                            if item.subject != '':
                                self.push_parser_items(schema, item.subject, "subject_occ", "subject")
                                self.push_parser_item_relationship(schema, item.subject, item.object,
                                                                   "subject_object_occ", "subject", "object")

            noun_adjectives = self.parser.nouns_adj_spacy(snippet)
            for item in noun_adjectives:
                subject = item['noun']
                adjective = item['adj']
                if self.__postgre_db.is_in_table(
                        schema, "single_pattern", "single_pattern=" + add_quotes(item['noun'])):
                    self.push_parser_items(schema, subject, "subject_occ", "subject")
                    self.push_parser_items(schema, adjective, "adjective_occ", "adjective")
                    self.push_parser_item_relationship(
                        schema, subject, adjective, "subject_adjective_occ", "subject", "adjective")

    def push_parser_items(self, schema, word, table, word_type):
        """Insert word with count 0 into the given occurrence table, if not already in the table."""
        if not self.__postgre_db.is_in_table(schema, table, word_type + "=" + add_quotes(word)):
            self.__postgre_db.insert(schema, table, {word_type: word, "count": 0})

    def push_parser_item_relationship(self, schema, word1, word2, table, word_type1, word_type2):
        """Insert the relation of word1 and word2 into table or increase its count by one."""
        word1_id = self.__postgre_db.get_id(schema, word_type1 + "_occ", word_type1 + "=" + add_quotes(word1))
        word2_id = self.__postgre_db.get_id(schema, word_type2 + "_occ", word_type2 + "=" + add_quotes(word2))
        condition = word_type1 + "=" + str(word1_id) + " and " + word_type2 + "=" + str(word2_id)
        if not self.__postgre_db.is_in_table(schema, table, condition):
            self.__postgre_db.insert(schema, table, {word_type1: word1_id, word_type2: word2_id, "count": 1})
        else:
            table_id = self.__postgre_db.get_id(schema, table, condition)
            old_count = self.__postgre_db.get(schema, table, "id=" + str(table_id), "count")
            self.__postgre_db.update(schema, table, "count=" + str(old_count + 1), "id=" + str(table_id))

    def aggregate_occurences_help(self, text_counter, word):
        """Return how often word occurs according to text_counter, but at least 1."""
        count = text_counter[word]
        if count == 0:
            return 1
        return count

    def calculate_pmi(self, schema):
        """Count the occurrences of all parsed words in the lemmatized corpus and calculate the PMI
        of all stored word relations."""
        print("Calculating PMI for " + schema)
        corpus_count = 0
        for item in self.__mongo_db.get(schema, {}):
            corpus_count += len(word_tokenize(item['text']))
        print(corpus_count)

        print("Lemmatizing corpus.")
        lemmatized_text = []
        for ind, text in enumerate(self.__mongo_db.get(schema, {})):
            doc = text['text']
            for ch in ['›', '‹', '»', '«']:
                if ch in doc:
                    doc = doc.replace(ch, '"')
            lemmatized_text += self.parser.lemmatize_chunk(doc)
            print("Part " + str(ind) + " lemmatized.")

        self.aggregate_occurences(schema, "subject", lemmatized_text)
        self.aggregate_occurences(schema, "object", lemmatized_text)
        self.aggregate_occurences(schema, "adjective", lemmatized_text)
        self.aggregate_occurences(schema, "verb", lemmatized_text)
        print("Finished aggregating occurences.")

        self.calculate_pmi_helper(schema, corpus_count, "subject_adjective_occ", "subject", "adjective")
        self.calculate_pmi_helper(schema, corpus_count, "subject_verb_occ", "subject", "verb")
        self.calculate_pmi_helper(schema, corpus_count, "subject_object_occ", "subject", "object")
        self.calculate_pmi_helper(schema, corpus_count, "object_verb_occ", "object", "verb")

    def aggregate_occurences(self, schema, word_table, lemmatized_text):
        """Count every entry of the table <word_table>_occ in the lemmatized text and store the count.

        Entries of two or three words are counted as bigrams / trigrams.
        Entries of more than three words are stored with count 0.
        """
        table = self.__postgre_db.get_data_from_table(schema, word_table + "_occ")
        # Count the corpus once instead of once per table row.
        word_counter = Counter(lemmatized_text)
        ngram_counters = {}
        for item in table:
            word = item[word_table]
            split_word = word.split(" ")
            length = len(split_word)
            if length == 1:
                count = self.aggregate_occurences_help(word_counter, word)
            elif length in (2, 3):
                if length not in ngram_counters:
                    ngrams = bigrams(lemmatized_text) if length == 2 else trigrams(lemmatized_text)
                    ngram_counters[length] = Counter(ngrams)
                count = ngram_counters[length][tuple(split_word)]
            else:
                count = 0
            print(word, str(count))
            self.__postgre_db.update(schema, word_table + "_occ", "count=" + str(count),
                                     "id=" + str(item['id']))

    @staticmethod
    def __compute_pmi(co_occ_count, word1_occ, word2_occ, corpus_count):
        """Return the pointwise mutual information of two words.

        Returns None if one of the counts is missing or zero, because the PMI
        is not defined then (division by zero or logarithm of zero).
        """
        if not corpus_count or not co_occ_count or not word1_occ or not word2_occ:
            return None
        co_occ_freq = float(co_occ_count / corpus_count)
        return log2(co_occ_freq / (float(word1_occ / corpus_count) * float(word2_occ / corpus_count)))

    def calculate_pmi_helper(self, schema, corpus_count, co_occurence, word1, word2):
        """Calculate and store the PMI for every row of the co-occurrence table co_occurence."""
        co_occ_table = self.__postgre_db.get_data_from_table(schema, co_occurence)
        for item in co_occ_table:
            item_id = item['id']
            word1_id = item[word1]
            word2_id = item[word2]
            word1_occ = self.__postgre_db.get(schema, word1 + "_occ", "id=" + str(word1_id), "count")
            word2_occ = self.__postgre_db.get(schema, word2 + "_occ", "id=" + str(word2_id), "count")
            pmi = self.__compute_pmi(item['count'], word1_occ, word2_occ, corpus_count)
            if pmi is None:
                print("PMI undefined for " + co_occurence + " id " + str(item_id) + ", skipped.")
                continue
            self.__postgre_db.update(schema, co_occurence, "pmi=" + str(pmi), "id=" + str(item_id))

    def calculate_pmi_use_case2(self, schema):
        """Count the single patterns in the corpus and calculate the PMI of all correlating patterns."""
        print("Calculating PMI for " + schema)
        corpus_count = 0
        text = []
        for item in self.__mongo_db.get(schema, {}):
            # tokenize every text only once
            tokens = word_tokenize(item['text'], language='german')
            text += tokens
            corpus_count += len(tokens)
        print(corpus_count)

        counter = Counter(text)
        single_pattern_table = self.__postgre_db.get_data_from_table(schema, "bscale_single_pattern")
        # counting single pattern occurrences
        for item in single_pattern_table:
            word = item['single_pattern']
            count = counter[word]
            self.__postgre_db.update(schema, "bscale_single_pattern", "count=" + str(count),
                                     "single_pattern=" + add_quotes(word))

        # pmi calculation
        # NOTE(review): pattern_a / pattern_b are filled with single_pattern_id values elsewhere
        # in this class but are looked up by "id" here -- confirm that both ids coincide.
        co_occ_table = self.__postgre_db.get_data_from_table(schema, "correlating_pattern")
        for item in co_occ_table:
            item_id = item['id']
            word1_id = item['pattern_a']
            word2_id = item['pattern_b']
            word1_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word1_id), "count")
            print(word1_occ)
            word2_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word2_id), "count")
            print(word2_occ)
            pmi = self.__compute_pmi(item['count'], word1_occ, word2_occ, corpus_count)
            print(pmi)
            if pmi is None:
                # e.g. a pattern that never occurs as a token of the corpus
                continue
            self.__postgre_db.update(schema, "correlating_pattern", "pmi=" + str(pmi), "id=" + str(item_id))

    def get_results_use_case2(self, schema):
        """Print the PMI of the correlating patterns for every pair of the bscales 1 to 4."""
        scale_pairs = (
            ("Colour + Nature", 1, 2),
            ("Colour + Location", 1, 3),
            ("Colour + Social", 1, 4),
            ("Nature + Location", 2, 3),
            ("Nature + Social", 2, 4),
            ("Location + Social", 3, 4),
        )
        for title, first, second in scale_pairs:
            print(title)
            pprint(self.__postgre_db.query(
                "SELECT C.pmi, S.single_pattern AS pattern_a, T.single_pattern AS pattern_b FROM "
                + schema + ".correlating_pattern C, "
                + schema + ".bscale_single_pattern S, "
                + schema + ".bscale_single_pattern T WHERE "
                + "(S.bscale_id = " + str(first) + " AND T.bscale_id = " + str(second)
                + " AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) OR "
                + "(S.bscale_id = " + str(second) + " AND T.bscale_id = " + str(first)
                + " AND C.pattern_a = S.single_pattern_id AND C.pattern_b = T.single_pattern_id) "
                + "ORDER BY pmi DESC"))

    def check_pattern(self, pattern, token):
        """Strip token and check if the token matches the defined pattern.

        Only the first run of word characters of the token is compared, so
        surrounding punctuation is ignored ("(car)." matches "car").

        Parameter:
        pattern -- the pattern to search for
        token -- the token to match with the pattern
        """
        first_word = re.search(r'\w+', token)
        stripped_token = first_word.group(0) if first_word else ''
        return stripped_token == pattern

    def get_result(self, schema):
        """Print the summed counts and the PMI values of all word relation tables."""
        for table in ("subject_verb_occ", "object_verb_occ", "subject_object_occ", "subject_adjective_occ"):
            print(self.__postgre_db.query(
                """SELECT SUM(SV.count) FROM """ + schema + "." + table + """ SV"""))

        pprint(self.__postgre_db.query(
            """SELECT S.subject, V.verb, SV.pmi FROM """ + schema + """.subject_verb_occ SV, """
            + schema + """.subject_occ S, """ + schema
            + """.verb_occ V WHERE SV.subject = S.id AND SV.verb = V.id ORDER BY subject DESC, pmi DESC"""))
        pprint(self.__postgre_db.query(
            """SELECT O.object, V.verb, OV.pmi FROM """ + schema + """.object_verb_occ OV, """
            + schema + """.object_occ O, """ + schema
            + """.verb_occ V WHERE OV.object = O.id AND OV.verb = V.id ORDER BY object DESC, pmi DESC"""))
        pprint(self.__postgre_db.query(
            """SELECT O.object, S.subject, SO.pmi FROM """ + schema + """.subject_object_occ SO, """
            + schema + """.subject_occ S, """ + schema
            + """.object_occ O WHERE SO.object = O.id AND SO.subject = S.id ORDER BY subject DESC, pmi DESC"""))
        pprint(self.__postgre_db.query(
            """SELECT S.subject, A.adjective, SA.pmi FROM """ + schema + """.subject_adjective_occ SA, """
            + schema + """.subject_occ S, """ + schema
            + """.adjective_occ A WHERE SA.subject = S.id AND SA.adjective = A.id ORDER BY subject DESC, pmi DESC"""))
def tokenize(text):
    """Return the tokens of ``text`` as produced by ``WhitespaceTokenizer``.

    The text is handed unchanged to a freshly constructed NLTK
    ``WhitespaceTokenizer``; no lower-casing, punctuation stripping or
    other normalisation is applied here.

    Parameters:
        text -- the string to tokenize

    Returns:
        whatever ``WhitespaceTokenizer.tokenize`` returns for ``text``
    """
    # Alternative left disabled by the original author:
    #   nltk.word_tokenize(text)
    return WhitespaceTokenizer().tokenize(text)
def word_parser(input_str):
    """Tokenize ``input_str`` with a new ``WhitespaceTokenizer``.

    Parameters:
        input_str -- the string to tokenize

    Returns:
        the result of ``WhitespaceTokenizer.tokenize`` for ``input_str``
    """
    split_on_whitespace = WhitespaceTokenizer().tokenize
    return split_on_whitespace(input_str)
import nltk

# Called without a resource identifier, exactly as in the original script.
nltk.download()

from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Sample input. The longer sentence is overwritten straight away, so only
# the short four-word sample is actually processed below.
text = "this is a block of text. I am writing a piece to explain the use of nlp packages."
text = 'Feet wolves talked cats'

# --- Tokenize the same text with three different tokenizers ---------------
tokenizer1 = WhitespaceTokenizer()      # split on whitespace
tokenizer2 = WordPunctTokenizer()       # split on whitespace as well as punctuation
tokenizer3 = TreebankWordTokenizer()

tokens1 = tokenizer1.tokenize(text)
tokens2 = tokenizer2.tokenize(text)
tokens3 = tokenizer3.tokenize(text)

# --- Lemmatize --------------------------------------------------------------
# Author's note: the best approach is to lemmatize first and stem afterwards.
# The stemmer is instantiated here, but this part of the script only
# lemmatizes the Treebank tokens; no stemming call is made.
ps = PorterStemmer()
lem = WordNetLemmatizer()

lemmatized_tokens = [lem.lemmatize(token) for token in tokens3]
# but now works ic_dict = {} cong = [] all_tokens = 0 #create IC dict #tokenizer = RegexpTokenizer(r'\w+') tokenizer = WhitespaceTokenizer() filename = "../Subtlex.US.txt" for line in open(filename,"r").readlines(): line = line.lower() line = line.strip() #line = line.replace("-"," ") #line = "self-support" line = ' '.join(word.strip(string.punctuation) for word in line.split()) print tokenizer.tokenize(line) t_list = tokenizer.tokenize(line) for token in t_list: try: token = token.encode("ascii", "ignore").lower() #token = unicode(token, 'utf8') #token = token.encode('utf8') try: ic_dict[token] ic_dict[token]+=1 except: ic_dict[token] = 1 all_tokens+=1
SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)), ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1,1)), ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)), ] tagged_sentences=[] tokenizer =WhitespaceTokenizer() with open("datascience_6.txt","r") as openfile: for line in openfile: words = line.lower().strip() words=re.sub(r'\~|\`|\@|\$|\%|\^|\&|\*|\(|\)|\_|\=|\{|\[|\}|\]|\\|\<|\,|\<|\>|\?|\/|\;|\:|\"|\'', '',words) words=words.split('\r') jobposts = [s.lstrip() for s in words] for jobpost in jobposts: sentences=jobpost.split('.') for sentence in sentences: tokenized_sentence=tokenizer.tokenize(sentence) initial_tagged_sentence=nltk.pos_tag(tokenized_sentence) tagged_sentences.append(initial_tagged_sentence) tagged_no_empties = [] a =[] for i in tagged_sentences: if a==i: pass else: tagged_no_empties.append(i) unigram_tagger=nltk.UnigramTagger(tagged_no_empties) trainer = FastBrillTaggerTrainer(initial_tagger=unigram_tagger, templates=templates, trace=3,deterministic=True) brill_tagger = trainer.train(tagged_sentences, max_rules=10)
BiGramTriGram = Counter() TriGramTriGram = Counter() tokenizer = WhitespaceTokenizer() for line in openfile: words = line.lower().strip().replace('(',',').replace(')',',') words=re.sub(r'\~|\`|\@|\$|\%|\^|\&|\*|\(|\)|\_|\=|\{|\[|\}|\]|\\|\<|\,|\<|\.|\>|\?|\/|\;|\:|\"|\'', '',words) words = pattern.sub('', words) words=words.split('\r') words = [s.lstrip() for s in words] ReservoirALL={} for word in words: CountWordGrams = Counter() CountBiGrams = Counter() CountTriGrams = Counter() wordsplit= tokenizer.tokenize(word) wordsplit = [s.lstrip() for s in wordsplit] NoDupes = list(set(wordsplit)) TuplesNoDupes=[tuple(i.split()) for i in NoDupes] skillsonly=[x for x in TuplesNoDupes if x in SKILLS] skillsclean = [token for token in skillsonly if token not in Stopwords] BiGrams=bigrams(wordsplit) NoDupesBiGrams = list(set(BiGrams)) BiGrams=[x for x in NoDupesBiGrams if x in SKILLS] TriGrams=trigrams(wordsplit) NoDupesTriGrams = list(set(TriGrams)) TriGrams=[x for x in NoDupesTriGrams if x in SKILLS] CountWordGrams.update(skillsclean) CountBiGrams.update(BiGrams)