def expandSet(kwd_set, root_elt): ''' Expands a given set of keywords using the whole text and co-occurance probabilities @param kwd_set: Set<string>. List of mentioned kwds @param root_elt: etree.Element. The root element of the document ''' lines = [elt.text for elt in root_elt.findall(".//line")] stop_words = set(stopwords.words("english")) tokenizer = PunktWordTokenizer() all_pairs = [] for line in lines: for kwd in kwd_set: if re.match(kwd, line): tokens = filter(lambda x: x not in stop_words and x not in string.punctuation, tokenizer.tokenize(line)) for token in tokens: all_pairs.append((kwd, token)) top_pairs = [pair for pair, freq in Counter(all_pairs).iteritems() if freq >= 2] for pair in top_pairs: if KeywordExpander.verbose and pair[1] not in kwd_set: print "Expanding kwd with : ", pair[1] kwd_set.add(pair[1]); return kwd_set
def tokenize_title(text):
    """Tokenize a title: strip periods, word-tokenize, drop stopwords.

    @param text: the title string.
    @return: list of non-stopword tokens.
    """
    text = re.sub('[.]', '', text)
    tokens = PunktWordTokenizer().tokenize(text)
    # BUG FIX: the original called list.remove() once per stopword, which
    # deletes only the FIRST occurrence -- repeated stopwords survived.
    # Filter every occurrence (set lookup is also O(1) per token).
    stop = set(stopwords.words('english'))
    return [tok for tok in tokens if tok not in stop]
def usingTitleAlgorithm(userinput):
    """Summarize the web page at `userinput` (a URL) by ranking its sentences
    against the page title.

    Returns three highly-ranked sentences followed by the title itself.
    NOTE(review): relies on web_crawler() and summarizer() defined elsewhere
    in this project.
    """
    # nltk's english stopword list, inlined
    stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
            'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
            'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its',
            'itself', 'they', 'them', 'their', 'theirs', 'themselves',
            'what', 'which', 'who', 'whom', 'this', 'that', 'these',
            'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
            'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
            'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because',
            'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
            'about', 'against', 'between', 'into', 'through', 'during',
            'before', 'after', 'above', 'below', 'to', 'from', 'up',
            'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
            'further', 'then', 'once', 'here', 'there', 'when', 'where',
            'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
            'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
            'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will',
            'just', 'don', 'should', 'now']
    url = str(userinput)
    title, htmlText = web_crawler(url)
    qry = PunktWordTokenizer().tokenize(title)  # tokenize title
    # Run query through stopwords
    qry = [words for words in qry if words.lower() not in stop]
    totalText = PunktSentenceTokenizer().tokenize(htmlText)
    textList = []
    for i in totalText:
        i = PunktWordTokenizer().tokenize(i.strip('.'))
        textList.append(i)
    sentenceRanks = summarizer(qry, textList)
    finalResults = []
    # skipping the first sentence because it's just going to be the title
    for num in range(1, 4):
        ind = sentenceRanks[num][0]
        finalResults.append(' '.join(textList[ind]))
    finalResults.append(title)
    return finalResults
def __init__(self, tok=False, wtok=False, stok=False, pos=False, stem=False,
             pos_model=None, abbrev_set=None, stok_model=None):
    """Configure the NLP helper.

    @param tok: shorthand that switches on both word and sentence tokenization.
    @param wtok: build a word tokenizer.
    @param stok: build a sentence tokenizer.
    @param pos: build a HunPos POS tagger (pos_model or $HUNPOS/english.model).
    @param stem: build a WordNet lemmatizer.
    @param pos_model: optional path to a HunPos model file.
    @param abbrev_set: a set of frequent abbreviations.
    @param stok_model: a pre-trained sentence tokenizer used instead of the
        default NLTK punkt English model.
    """
    if tok:
        wtok = True
        stok = True
    if wtok:
        self.wordTokenizer = PunktWordTokenizer()
        #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
        # Splits a trailing punctuation mark off a token.
        self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)  # Bragantino,2006.In fix this shit
    if stok:
        if stok_model is not None:
            # NOTE(review): a plain assignment cannot raise LookupError, so
            # this except branch looks like dead code and the fallback is
            # never reached on this path -- confirm the intent.
            try:
                self.senTokenizer = stok_model
            except LookupError:
                sys.stderr.write("WARNING: tokenizer cannot be loaded")
                sys.stderr.write("WARNING: using an untrained sen_tokenizer")
                self.senTokenizer = PunktSentenceTokenizer()
        else:
            try:
                self.senTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            except LookupError:
                sys.stderr.write("WARNING: english tokenizer cannot be loaded, nltk.data does not contain in!")
                sys.stderr.write("WARNING: using an untrained sen_tokenizer")
                self.senTokenizer = PunktSentenceTokenizer()
    self.abbrev_set = (set(abbrev_set) if abbrev_set is not None else set())
    if pos:
        if pos_model is not None:
            self.posTagger = HunposTagger(pos_model, encoding="utf-8")
        else:
            self.posTagger = HunposTagger(os.path.join(os.environ['HUNPOS'], 'english.model'), encoding="utf-8")
    if stem:
        self.stemmer = WordNetLemmatizer()
def get_indices(): text = """What the f**k did you just f*****g say about me, you little bitch? I'll have you know I graduated top of my class in the military, and I've been involved in numerous secret raids on enemies, and I have over 300 confirmed kills. I am trained in gorilla warfare and I'm the top sniper in the entire US army. You are nothing to me but just another target. I will wipe you the f**k out with precision the likes of which has never been seen before on this Earth, mark my f*****g words. You think you can get away with saying that shit to me over the Internet? Think again, f****r. As we speak I am contacting my secret network of spies across the USA and your IP is being traced right now so you better prepare for the storm, maggot. The storm that wipes out the pathetic little thing you call your life. You're f*****g dead, kid. I can be anywhere, anytime, and I can kill you in over seven hundred ways, and that's just with my bare hands. Not only am I extensively trained in unarmed combat, but I have access to the entire arsenal of the United States marines and I will use it to its full extent to wipe your miserable ass off the face of the continent, you little shit. If only you could have known what unholy retribution your little "clever" comment was about to bring down upon you, maybe you would have held your f*****g tongue. But you couldn't, you didn't, and now you're paying the price, you goddamn idiot. I will shit fury all over you and you will drown in it. You're f*****g dead, kid.""" # text = "Hi I like call of duty." text_tokens = PunktWordTokenizer().tokenize(text) print(len(text_tokens)) print(text_tokens.index("clever")) result = [] for ii, word in enumerate(text_tokens): answer = raw_input("Replace '{0}'? (y/n): ".format(word)) if answer == 'y': result.append(ii) print(result)
def Document2Word2VecTrainingInputFormat(document):
    """Convert a plain-text document into word2vec training input.

    Splits `document` into sentences with the NLTK punkt English model,
    then word-tokenizes each sentence.  Returns a list of token lists,
    one inner list per sentence.
    """
    sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    word_splitter = PunktWordTokenizer()
    return [word_splitter.tokenize(sentence)
            for sentence in sentence_splitter.tokenize(document)]
def worker(args):
    """Multiprocessing worker: sentence- and word-tokenize every story found
    in a chunk of gzipped files, writing one normalized sentence per line.

    @param args: (chunk, ofile) -- list of gzip paths and the output path.
    """
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    word_detector = PunktWordTokenizer()

    def split_sentences(txt):
        # Sentence-tokenize, then normalize each sentence to single spaces
        # between word tokens; skip sentences that end up empty.
        sents = sent_detector.tokenize(txt.strip(), realign_boundaries=True)
        for sent in sents:
            sstr = u' '.join(word for word in word_detector.tokenize(sent))
            tkns = filter(None, sstr.split(u' '))
            if len(tkns) > 0:
                yield u' '.join(tkns)

    chunk, ofile = args
    with codecs.open(ofile, 'w', 'utf-8') as of:
        for path in chunk:
            with gzip.open(path) as f:
                # find_story() is a project helper; it may yield None.
                for txt in find_story(f):
                    if txt is None:
                        continue
                    else:
                        for sent in split_sentences(txt):
                            of.write(sent)
                            of.write(u'\n')
            # flush once per input file so progress is visible on disk
            of.flush()
            print 'Completed', path
class Translator:
    """Word-by-word sentence translator backed by one or more dictionaries."""

    def __init__(self, *dictionaries):
        # Dictionaries are consulted in the order given.
        self.tokenizer = PunktWordTokenizer()
        self.dictionaries = dictionaries

    def translate(self, sentence):
        """Tokenize `sentence` and translate each token.

        Unknown words are passed through unchanged; when a dictionary maps a
        word to several candidates, the first one is picked.
        """
        tokens = self.tokenizer.tokenize(sentence)

        def select_value(l):
            '''Should select the correct value.'''
            #TODO: Implement this, right now has default behavior
            if isinstance(l, list):
                return l[0]
            else:
                return l

        def tr(word):
            # BUG FIX: the original returned `word` from an else-branch
            # INSIDE the loop, so only the first dictionary was ever
            # consulted.  Fall through to the next dictionary and give up
            # only after all of them miss.
            for d in self.dictionaries:
                found = d[word]
                if found is not None:
                    return found
            return word

        return [select_value(tr(w)) for w in tokens]
def __init__(self, sentences_file, stopwords):
    """Build a gensim dictionary/corpus/TF-IDF model from a sentences file.

    @param sentences_file: UTF-8 text file, one sentence per line; inline
        <TAG>...</TAG> entity markup is stripped before tokenization.
    @param stopwords: container of words dropped after tokenization.
    """
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print "Gathering sentences and removing stopwords"
    for line in f_sentences:
        # strip <X>...</X> entity markup
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)
        # remove stop words and tokenize
        document = [word for word in PunktWordTokenizer().tokenize(line.lower()) if word not in stopwords]
        documents.append(document)
        count += 1
        # progress dot every 10k sentences
        if count % 10000 == 0:
            sys.stdout.write(".")
    f_sentences.close()
    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)
    print len(documents), "documents red"  # NOTE(review): "red" -> "read"?
    print len(self.dictionary), " unique tokens"
def __init__(self, tok=False, wtok=False, stok=False, pos=False, stem=False,
             pos_model=None, abbrev_set=None):
    """Configure the NLP helper.

    @param tok: shorthand that switches on both word and sentence tokenization.
    @param wtok: build a word tokenizer.
    @param stok: build a sentence tokenizer (NLTK punkt English model, with
        an untrained fallback when the model cannot be loaded).
    @param pos: build a HunPos POS tagger (pos_model or $HUNPOS/english.model).
    @param stem: build a WordNet lemmatizer.
    @param abbrev_set: a set of frequent abbreviations.
    """
    if tok:
        wtok = True
        stok = True
    if wtok:
        self.wordTokenizer = PunktWordTokenizer()
        #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
        # Splits a trailing punctuation mark off a token.
        self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)  # Bragantino,2006.In fix this shit
    if stok:
        try:
            self.senTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except LookupError:
            sys.stderr.write("WARNING: english tokenizer cannot be loaded, nltk.data does not contain in!")
            sys.stderr.write("WARNING: using an untrained sen_tokenizer")
            self.senTokenizer = PunktSentenceTokenizer()
    self.abbrev_set = (set(abbrev_set) if abbrev_set is not None else set())
    if pos:
        if pos_model is not None:
            self.posTagger = HunposTagger(pos_model, encoding="utf-8")
        else:
            self.posTagger = HunposTagger(os.path.join(os.environ['HUNPOS'], 'english.model'), encoding="utf-8")
    if stem:
        self.stemmer = WordNetLemmatizer()
def get_non_nouns(sentence):
    """Return the adjective/adverb tokens of `sentence`.

    Tokenizes, POS-tags, and keeps only tokens whose tag is in the
    adjective/adverb set.
    """
    s1 = PunktWordTokenizer().tokenize(sentence)
    s2 = nltk.pos_tag(s1)
    # NOTE(review): "JJS" is listed twice; "JJR" (comparative adjective)
    # may have been intended -- kept as-is to preserve behavior.
    useful = ["JJ", "JJS", "JJS", "RB", "RBR", "RBS"]
    splitted = []
    for pairs in s2:
        if (pairs[1] in useful):
            splitted.append(pairs[0])
    return splitted
    # BUG FIX: the original continued with lemmatization and aspect-scoring
    # code AFTER this return -- all of it unreachable dead code (it also
    # referenced module globals feature_list/ad_words).  Removed.
def worker(args):
    """Multiprocessing worker: tokenize a batch of text files into normalized
    sentences and write them, one per line, to `ofile`.

    @param args: (tdir, txt_files, ofile) -- input dir, file names, output path.
    """
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    word_detector = PunktWordTokenizer()

    def split_sentences(txt):
        # Sentence-tokenize, then normalize each sentence to single spaces
        # between word tokens; skip sentences that end up empty.
        sents = sent_detector.tokenize(txt.strip(), realign_boundaries=True)
        for sent in sents:
            sstr = u' '.join(word for word in word_detector.tokenize(sent))
            tkns = filter(None, sstr.split(u' '))
            if len(tkns) > 0:
                yield u' '.join(tkns)

    tdir, txt_files, ofile = args
    with codecs.open(ofile, 'w', 'utf-8') as of:
        for fname in txt_files:
            txt_file = os.path.join(tdir, fname)
            with codecs.open(txt_file, 'r', 'utf-8') as f:
                text = u' '.join(f.readlines())
            for sent in split_sentences(text):
                of.write(sent)
                of.write(u'\n')
            # flush once per input file so progress is visible on disk
            of.flush()
            print 'Completed', txt_file
def __init__(self, sentences_file, stopwords):
    """Build a gensim dictionary/corpus/TF-IDF model from a sentences file.

    @param sentences_file: UTF-8 file, one sentence per line; <TAG>...</TAG>
        entity markup is stripped before tokenization.
    @param stopwords: container of words dropped after tokenization.
    """
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    print "Gathering sentences and removing stopwords"
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)
        # TODO: remove punctuation, commas, etc.
        # remove common words and tokenize
        document = [word for word in PunktWordTokenizer().tokenize(line.lower()) if word not in stopwords]
        documents.append(document)
    # TODO: avoid keeping all documents in memory
    #dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))
    f_sentences.close()
    """
    print "Removing tokens that appear only once"
    # remove words that appear only once
    # TODO: ver qual eh a frequencia de corte no word2vec, e fazer o mesmo
    all_tokens = sum(documents, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    documents = [[word for word in text if word not in tokens_once] for text in documents]
    """
    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)
    print len(documents), "documents red"  # NOTE(review): "red" -> "read"?
    print len(self.dictionary), " unique tokens"
def parse_ap(input_path, output_path):
    """Parse an AP-style SGML corpus file into "title<TAB>tokens" lines.

    Tokens are lower-case alphabetic words, stopword-filtered and
    Porter-stemmed; one output line is written per </doc>.
    """
    from nltk.corpus import stopwords
    stop = stopwords.words('english')
    from nltk.tokenize.punkt import PunktWordTokenizer
    tokenizer = PunktWordTokenizer()
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    from string import ascii_lowercase

    doc_title = ""
    doc_content = []
    doc_count = 0

    input_file_stream = open(input_path, 'r')
    output_file_stream = open(output_path, 'w')

    for line in input_file_stream:
        line = line.strip().lower()
        if line == "<text>" or line == "</text>":
            continue
        if line == "<doc>":
            continue
        if line == "</doc>":
            output_file_stream.write("%s\t%s\n" % (doc_title, " ".join(doc_content)))
            doc_count += 1
            if doc_count % 1000 == 0:
                print("successfully parsed %d documents" % (doc_count))
            continue
        if line.startswith("<docno>"):
            # BUG FIX: lstrip("<docno>")/rstrip("</docno>") strip a SET of
            # characters, not a prefix/suffix, mangling ids that begin or
            # end with any of those letters.  Slice the tags off instead.
            line = line[len("<docno>"):]
            if line.endswith("</docno>"):
                line = line[:-len("</docno>")]
            doc_title = line.strip()
            continue
        # BUG FIX: min() over an empty generator raises ValueError for an
        # empty token; all() plus an explicit non-empty guard is safe and
        # equivalent for non-empty tokens.
        doc_tokens = [x for x in tokenizer.tokenize(line)
                      if x and all(y in ascii_lowercase for y in x)]
        # NOTE(review): doc_content is OVERWRITTEN per text line, so only
        # the last line of a multi-line document survives -- kept as in
        # the original; confirm whether accumulation was intended.
        doc_content = [stemmer.stem(x) for x in doc_tokens if x not in stop]

    # BUG FIX: the original leaked both file handles.
    input_file_stream.close()
    output_file_stream.close()
def parse_ap(input_path, output_path):
    """Parse an AP-style SGML corpus file into "title<TAB>tokens" lines.

    Tokens are lower-case alphabetic words, stopword-filtered and
    Porter-stemmed.  NOTE(review): latent issues kept as-is:
    lstrip("<docno>")/rstrip("</docno>") strip a CHARACTER SET, not a
    prefix/suffix, so ids beginning/ending with those letters get mangled;
    doc_content is overwritten per text line (multi-line documents keep only
    the last line); min() over an empty token raises ValueError; neither
    file handle is closed.
    """
    from nltk.corpus import stopwords
    stop = stopwords.words('english')
    from nltk.tokenize.punkt import PunktWordTokenizer
    tokenizer = PunktWordTokenizer()
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer();
    from string import ascii_lowercase
    doc_title = "";
    doc_content = [];
    doc_count = 0;
    input_file_stream = open(input_path, 'r');
    output_file_stream = open(output_path, 'w');
    for line in input_file_stream:
        line = line.strip().lower();
        if line=="<text>" or line=="</text>":
            continue;
        if line=="<doc>":
            continue;
        if line=="</doc>":
            # one output record per document
            output_file_stream.write("%s\t%s\n" % (doc_title, " ".join(doc_content)));
            doc_count += 1;
            if doc_count%1000==0:
                print("successfully parsed %d documents" % (doc_count));
            continue;
        if line.startswith("<docno>"):
            line = line.lstrip("<docno>");
            line = line.rstrip("</docno>");
            doc_title = line.strip();
            continue;
        #doc_content = [stemmer.stem(x) for x in tokenizer.tokenize(line) if (min(y in ascii_lowercase for y in x))];
        doc_tokens = [x for x in tokenizer.tokenize(line) if (min(y in ascii_lowercase for y in x))];
        doc_content = [stemmer.stem(x) for x in doc_tokens if x not in stop];
def SentenceBreak(self, paragraphs):
    """Word-tokenize each paragraph and, when a sentence-break model is
    configured, split the re-joined paragraph into sentences.

    Returns a flat list of space-joined token strings.
    """
    result = []
    tokenizer = PunktWordTokenizer()
    for paragraph in paragraphs:
        rejoined = ' '.join(tokenizer.tokenize(paragraph.strip()))
        if self.sbreak_model:
            result.extend(self.sbreak_model.tokenize(rejoined))
        else:
            result.append(rejoined)
    return result
def get_non_nouns(sentence):
    """Pick out the adjective/adverb tokens of `sentence` via POS tagging."""
    tagged = nltk.pos_tag(PunktWordTokenizer().tokenize(sentence))
    # adjective and adverb tags ("JJS" appears twice in the original list;
    # membership testing is unaffected)
    wanted = ["JJ", "JJS", "JJS", "RB", "RBR", "RBS"]
    return [word for word, tag in tagged if tag in wanted]
def remove_unnecessary(sentence):
    """Keep only the content words of `sentence`: adjectives, nouns, adverbs."""
    tagged = nltk.pos_tag(PunktWordTokenizer().tokenize(sentence))
    # adjective, noun, and adverb tags ("JJS" appears twice in the original
    # list; membership testing is unaffected)
    wanted = ["JJ", "JJS", "JJS", "NN", "NNS", "NNP", "NNPS", "RB", "RBR", "RBS"]
    return [word for word, tag in tagged if tag in wanted]
def usingSentenceIntersectionAlgorithm(userinput):
    """Summarize the web page at `userinput` (a URL) using sentence-
    intersection ranking.

    Returns three top-ranked sentence indices/entries followed by the title.
    NOTE(review): relies on web_crawler() and sentence_ranks() defined
    elsewhere; `qry` is computed but never used.
    """
    url = userinput
    title, htmlText = web_crawler(url)
    qry = PunktWordTokenizer().tokenize(title)  # tokenize title
    totalText = PunktSentenceTokenizer().tokenize(htmlText)
    textList = []
    for i in totalText:
        i = PunktWordTokenizer().tokenize(i.strip('.'))
        textList.append(i)
    ranks = sentence_ranks(textList)
    # sort by score, best first
    ranks_sorted = sorted(ranks.items(), key = operator.itemgetter(1))
    ranks_sorted.reverse()
    finalResults = []
    # skipping the first sentence because it's just going to be the title
    for num in range(1,4):
        finalResults.append(ranks_sorted[num][0])
    finalResults.append(title)
    return finalResults
def tokenize_document(doc, is_ted, is_only_nouns): """ For a given string text, the script get the text's tokens. The text is pre-processd and filtered, after that the NLTK tokenizer process is carried out, if a flag is enabled, the tokens are tagged and filtered out only the nouns and finally the tokens are lemmatized. PARAMETERS: 1. doc: The string text from which extract the tokens 2. is_ted: A flag to say if to add to the english standard stopword the custom stopwords prepared for the TED talks corpus 3. is_only_nouns: A flag to say if extract only the tokens tagged as a nouns RETURNS: A list of strings where each string is a token from the given text """ res = [] try: # First pre-process and filter the given text doc2= remove_punctuation_stopwords(doc, is_ted) # From the pre-proccesed and filtered text apply the NLTK tokenizer process tokens = PunktWordTokenizer().tokenize(' '.join(doc2)) # If enabled the flag, then only extract the tokens tagged as a nouns if is_only_nouns: tagged_tokens = nltk.pos_tag(tokens) tokens = [] for token, tag in tagged_tokens: if (tag == 'NN') or (tag == 'NNP') or (tag == 'NNS'): tokens.append(token) # Lemmatize the tokens using the NLTK lemmatizer for i in range(0,len(tokens)): lema = WordNetLemmatizer().lemmatize(tokens[i]) # If the token was not lemmatized, then apply verb lemmatization if lema == tokens[i]: lema = WordNetLemmatizer().lemmatize(tokens[i], 'v') if (len(lema) > 1) and (not lema.isdigit()): # Append the lema to the result to be returned res.append(lema) except: print "tokenize_document" print "" traceback.print_exc() return res
def stringTokenizer(s): try: punct = set(string.punctuation) s = ''.join(x for x in s if x not in punct) s = s.lower() s = unicode(s) print "tokenizing query" tokens = PunktWordTokenizer().tokenize(s) tokens = [w for w in tokens if not w in stopset] return tokens except: print "keyword tokenizing error"
def test_latin_stopwords(self):
    """Filter Latin stopwords"""
    from cltk.stop.classical_latin.stops import STOPS_LIST
    sentence = 'Quo usque tandem abutere, Catilina, patientia nostra?'
    tokens = PunktWordTokenizer().tokenize(sentence.lower())
    filtered = [token for token in tokens if token not in STOPS_LIST]
    expected = ['usque', 'tandem', 'abutere', ',', 'catilina', ',',
                'patientia', 'nostra', '?']
    self.assertEqual(filtered, expected)
def neutralRemover(message):
    """Score a message as bullish (1), bearish (-1), or neutral (0).

    Each token is compared against the module-level `pos` and `neg` word
    collections and the tallies are compared.
    """
    bullish, bearish = 0, 0
    # Tokenize the message into individual words and tally sentiment hits.
    for token in PunktWordTokenizer().tokenize(message):
        if token in pos:
            bullish += 1
        if token in neg:
            bearish += 1
    # Net sentiment decides the label.
    if bullish > bearish:
        return 1
    if bullish < bearish:
        return -1
    return 0
def main():
    """Rewrite a document, replacing words that are WordNet synonyms of
    query words with the query words themselves.

    Usage: script <query_file> <doc_file>; the rewritten text is written to
    a name derived by output_name().
    """
    query_file = sys.argv[1]
    doc_file = sys.argv[2]
    new_file = output_name(doc_file)
    # Python-2 str.translate: delete all punctuation characters.
    query = read_file(query_file).translate(None, string.punctuation)
    query_words = PunktWordTokenizer().tokenize(query)
    lookup = {}
    # Construct dictionary from words to synonyms in query
    for word in query_words:
        word = str.lower(word)
        # NOTE(review): `break` aborts the whole loop at the first
        # duplicate query word -- `continue` was probably intended.
        if word in lookup:
            break
        synonyms = [lemma.name for lemma in sum([ss.lemmas for ss in wordnet.synsets(word)], [])]
        lookup[word] = synonyms
    output = ""
    with open(doc_file) as f:
        # traverse the document looking to replace synonyms of words in the query
        for line in f:
            text = line.translate(None, string.punctuation)
            text_words = PunktWordTokenizer().tokenize(text)
            for (i, word) in enumerate(text_words):
                for key in lookup:
                    if str.lower(word) in lookup[key]:
                        text_words[i] = key
                        break
            output += " ".join(text_words) + "\n"
    if os.path.exists(new_file):
        os.remove(new_file)
    with open(new_file, 'w') as f:
        f.write(output)
    print 'done'
def parser():
    """web2py controller: dictionary search over BundjalungExamples.

    Looks up request.vars.query on the English or Language side; single-word
    queries are redirected to the dictionary action, and multi-word queries
    with no matching example rows fall back to word-by-word translation via
    the project stemmer/dictionary helpers.
    """
    words = None
    searchterm = request.vars.query
    type = request.vars.type
    # null searches
    if not(request.vars):
        return dict(wordlist=True, words=words)
    elif not(searchterm):
        redirect (URL(r=request, c="language", f="dictionary"))
    elif (' ' in searchterm):
        # multi-word query: handled by the example-sentence search below
        pass
    else:
        redirect (URL(r=request, c="language", f="dictionary", vars={'query':searchterm, 'type':type}))
    typequery = request.vars.type
    ### add reference to example sentences
    #wordlist=searchterm.split()
    #for word in wordlist:
    #    pass
    if typequery=="English":
        query = dblanguage.BundjalungExamples.English.like('%%%s%%' % searchterm)
    else:
        query = dblanguage.BundjalungExamples.Language.like('%%%s%%' % searchterm)
    words = dblanguage(query)
    try:
        words = words.select()
    except:
        redirect (URL(r=request, c="language", f="dictionary", vars={'query':searchterm, 'type':type}))
    ## else load dictionary
    wd = dictionary.AboriginalLanguageDictionary()
    ws = stemmer.AboriginalLanguageStemmer()
    if (words):
        return dict(wordlist=False, words=words, query=searchterm)
    else:
        # no example rows: translate the query word by word
        newwords = PunktWordTokenizer().tokenize(searchterm)
        words = []
        for word in newwords:
            words += [translate_word(word, typequery, ws, wd)]
        lang = []
        english = []
        pos = []
        for word in words:
            printed_word = print_word(word)
            lang.append(printed_word[0])
            english.append(printed_word[1])
            pos.append(printed_word[2])
        words = [lang, english, pos]
        return dict(wordlist=True, words=words, query=request.vars.query)
def overlapcontext(synset, sentence):
    """Simplified-Lesk overlap: count the words shared between a synset's
    gloss (definition plus examples) and `sentence`, ignoring function words.

    `sentence` may be a str (split on spaces), a list, or a set; any other
    type returns None.
    """
    gloss = set(PunktWordTokenizer().tokenize(synset.definition()))
    for example in synset.examples():
        # BUG FIX: the original called gloss.union(i) and DISCARDED the
        # result, so example sentences never contributed anything (and even
        # if kept, iterating a raw string would have added single
        # characters).  Tokenize the example and update the set in place.
        gloss.update(PunktWordTokenizer().tokenize(example))
    gloss = gloss.difference(functionwords)
    if isinstance(sentence, str):
        sentence = set(sentence.split(" "))
    elif isinstance(sentence, list):
        sentence = set(sentence)
    elif isinstance(sentence, set):
        pass
    else:
        return
    sentence = sentence.difference(functionwords)
    return len(gloss.intersection(sentence))
def test_greek_stopwords(self):
    """Filter Greek stopwords"""
    from cltk.stop.classical_greek.stops_unicode import STOPS_LIST
    # Herodotus 1.171 (polytonic Greek)
    sentence = """Ἅρπαγος δὲ καταστρεψάμενος Ἰωνίην ἐποιέετο στρατηίην ἐπὶ Κᾶρας καὶ Καυνίους καὶ Λυκίους, ἅμα ἀγόμενος καὶ Ἴωνας καὶ Αἰολέας."""
    lowered = sentence.lower()
    tokens = PunktWordTokenizer().tokenize(lowered)
    no_stops = [w for w in tokens if w not in STOPS_LIST]
    target_list = [
        'ἅρπαγος', 'καταστρεψάμενος', 'ἰωνίην', 'ἐποιέετο', 'στρατηίην',
        'κᾶρας', 'καυνίους', 'λυκίους', ',', 'ἅμα', 'ἀγόμενος', 'ἴωνας',
        'αἰολέας.'
    ]
    self.assertEqual(no_stops, target_list)
def split(self, sep=None, maxsplit=-1):
    """str.split-compatible override that Punkt-tokenizes by default.

    With no explicit separator and the default maxsplit, the text is
    word-tokenized; otherwise plain str.split is used.  Each resulting
    token is wrapped in self.__class__ carrying an IOB tag: OUT when this
    string is outside a span, BEGINS on the first token of a beginning
    span, IN otherwise.
    """
    if not sep and maxsplit == -1:
        tokens = PunktWordTokenizer().tokenize(self)
    else:
        tokens = str.split(self, sep, maxsplit)
    result = []
    for (index, token) in enumerate(tokens):
        if self.iob_out():
            iob_tag = self.OUT
        elif self.iob_begins() and index == 0:
            iob_tag = self.BEGINS
        else:
            iob_tag = self.IN
        result.append(self.__class__(token, iob_tag=iob_tag))
    return result
def contentTokenizer(url): try: response = mechanize.urlopen(url) soup = BeautifulSoup(response.read()) s = soup.findAll(text=True) print "tokenizing : " + url punct = set(string.punctuation) s = ''.join(x for x in s if x not in punct) s = s.lower() s = unicode(s) tokens = PunktWordTokenizer().tokenize(s) tokens = [w for w in tokens if not w in stopset] return tokens except: print "url tokenizing error"
def processDataFile():
    """Read data_file_trimmed.txt and return its lemmatized nouns.

    Each line is tokenized and POS-tagged; getNouns() (project helper)
    selects the noun tokens, which are then WordNet-lemmatized.
    """
    file_noun_list = []
    data_file_address = "data_file_trimmed.txt"
    # IMPROVED: the lemmatizer was rebuilt for EVERY noun; build both
    # heavyweight helpers once.
    lmtzr = WordNetLemmatizer()
    tokenizer = PunktWordTokenizer()
    # BUG FIX: the file handle was never closed; use a with-block.
    with open(data_file_address, 'r') as file_in:
        for line in file_in:
            tagged = nltk.pos_tag(tokenizer.tokenize(line))
            for noun in getNouns(tagged):
                file_noun_list.append(lmtzr.lemmatize(noun))
    return file_noun_list
def create_dictionary(self): """ This function creates the dictionary""" if self.input_dir is not False: assert ("No Input dir") if self.save_file is not False: assert ("No Save File") if self.files_list is not False: assert ("No files list file") wnl = WordNetLemmatizer() pt = PunktWordTokenizer() files = os.listdir(self.input_dir) token_dicts = {} article_files = [] for f in files: with open(os.path.join(self.input_dir, f), "r") as g: #tokens = pt.tokenize(g.read()) #stemmed = [wnl.lemmatize(token) for token in tokens] #stemmed_nostop = [w for w in stemmed if w not in stopwords.words('english')] # Now let's hold all of the data in lists token_dicts[f] = g.read() #article_files.append(f) # Now let's initialize the tfIDF vectorizer print "Starting Vectorizer" tfidf = TfidfVectorizer(max_features=1000) matrix_val = tfidf.fit_transform(token_dicts.values()) print type(matrix_val) mat_file = self.save_file + ".npy" self.matrix_val = matrix_val # Now write out this object pickle.dump(tfidf, open(self.save_file, "w")) # File lists with open(self.files_list, "w") as f: f.write("\n".join(token_dicts.keys())) self.file_names = token_dicts.keys() pass
def read_file(file_object):
    """Debug pipeline: for each line of `file_object`, tokenize, drop
    stopwords, POS-tag, stem, and lemmatize -- printing every stage."""
    lines = file_object.readlines()
    for line in lines:
        print "#######LINE#######"
        print line
        text = PunktWordTokenizer().tokenize(line)
        #text = nltk.wordpunct_tokenize(line)
        print "#######TEXT#######"
        print text
        """ STOP WORD """
        stopwords = nltk.corpus.stopwords.words('english')
        # NOTE(review): w[0].lower() tests only the FIRST CHARACTER of each
        # token against the stopword list (dropping any token starting with
        # 'i', 'a', 's', ...) -- w.lower() was probably intended.
        content = [w for w in text if w[0].lower() not in stopwords]
        print "#######STOP WORD#######"
        print content
        """ POS TAGGING """
        tagged_sent = nltk.pos_tag(content)
        # simplify_wsj_tag collapses the Penn tags to a coarse tagset
        tagged_sent = [(word, simplify_wsj_tag(tag)) for word, tag in tagged_sent]
        print "#######POS#######"
        print tagged_sent
        """ STEMMING """
        #tagged_sent = tuple(tagged_sent)
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stem_word = ""
        for wrd in tagged_sent:
            stem_word = stem_word + " " + stemmer.stem(wrd[0])
        print "#######STEMMING#######"
        print stem_word
        """ LEMMATIZING """
        print tagged_sent
        lmtzr = WordNetLemmatizer()
        sent = ""
        for wrd in tagged_sent:
            sent = sent + " " + lmtzr.lemmatize(wrd[0])
        print "#######LEMMA"
        """"""
        print sent
def sentenceSentiment(option):  #option = 0 or 1
    """Count positive/negative sentiment words in every extracted sentence.

    @param option: 0 -> return the nested dict with per-sentence (pos, neg)
        tuples; 1 -> return {file: [total_pos, total_neg]} per document.
    Downloads Neal Caren's positive/negative word lists on every call.
    """
    from nltk.tokenize.punkt import PunktWordTokenizer
    urlneg = 'http://www.unc.edu/~ncaren/haphazard/negative.txt'
    urlpos = 'http://www.unc.edu/~ncaren/haphazard/positive.txt'
    urllib.urlretrieve(urlneg, dirWordLists + os.sep + 'negative.txt')
    urllib.urlretrieve(urlpos, dirWordLists + os.sep + 'positive.txt')
    neg_list = open(dirWordLists + os.sep + 'negative.txt').read()
    pos_list = open(dirWordLists + os.sep + 'positive.txt').read()
    # de-duplicate, sort, and drop empty lines
    neg_list = sorted(list(set(neg_list.split('\n'))))
    neg_list = filter(None, neg_list)
    pos_list = sorted(list(set(pos_list.split('\n'))))
    pos_list = filter(None, pos_list)
    # two copies: one keeps sentences, the other is overwritten with counts
    sent_Dict_sentiment = sentExtract('trainingset')
    sent_Dict = sentExtract('trainingset')
    setLocation = pathToSets + os.sep + 'trainingset'
    file_list = os.listdir(setLocation)
    file_list.remove('.DS_Store')
    file_list.sort()
    conceptVectors = getConceptVectors()
    for i in file_list:
        for j in conceptVectors:
            for n in list(range(len(sent_Dict[i][j]))):
                pos_count = 0
                neg_count = 0
                words = PunktWordTokenizer().tokenize(sent_Dict[i][j][n])
                for word in words:
                    if word in pos_list:
                        pos_count += 1
                    elif word in neg_list:
                        neg_count += 1
                # replace the sentence with its (pos, neg) tally
                sent_Dict_sentiment[i][j][n] = (pos_count, neg_count)
    if option == 0:
        #calculate overall positive and negative scores for documents
        return(sent_Dict_sentiment)
    elif option == 1:
        #calculate positive and negative separately for each document
        sent_Dict_Sum_sentiment = {}
        for x in file_list:
            total_pos = 0
            total_neg = 0
            for y in conceptVectors:
                for z in list(range(len(sent_Dict_sentiment[x][y]))):
                    total_pos += sent_Dict_sentiment[x][y][z][0]
                    total_neg += sent_Dict_sentiment[x][y][z][1]
            sent_Dict_Sum_sentiment[x] = [total_pos, total_neg]
        return(sent_Dict_Sum_sentiment)
def tokenize(text):
    """Tokenize and lemmatize `text` (a sequence of strings).

    NOTE(review): relies on module-level `lang` and `lang_dictionary`; for
    non-English input tokens are mapped through lang_dictionary when present
    and otherwise left untouched -- confirm those globals against callers.
    """
    text = ' '.join(text)
    tokens = PunktWordTokenizer().tokenize(text)
    # lemmatize words. try both noun and verb lemmatizations
    lmtzr = WordNetLemmatizer()
    for i in range(0, len(tokens)):
        if lang != 'english':
            if tokens[i] in lang_dictionary:
                tokens[i] = lang_dictionary[tokens[i]]
        else:
            res = lmtzr.lemmatize(tokens[i])
            # noun lemmatization was a no-op: fall back to verb
            if res == tokens[i]:
                tokens[i] = lmtzr.lemmatize(tokens[i], 'v')
            else:
                tokens[i] = res
    # don't return any single letters (or pure digits)
    tokens = [t for t in tokens if len(t) > 1 and not t.isdigit()]
    return tokens
def worker(args):
    """Worker: tokenize a batch of text files into sentences of \\w+ tokens
    and write them to `ofile` -- one sentence per line, or one document per
    line when `dpl` is truthy.

    @param args: (tdir, txt_files, ofile, dpl)
    """
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    # NOTE(review): word_detector is leftover from an earlier version (see
    # the commented lines below); tokens now come from RegexpTokenizer, so
    # punctuation is dropped.
    word_detector = PunktWordTokenizer()

    def split_sentences(txt):
        sents = sent_detector.tokenize(txt.strip(), realign_boundaries=True)
        tokenizer = RegexpTokenizer(r'\w+')
        for sent in sents:
            tkns = tokenizer.tokenize(sent)
            #in word_detector.tokenize(sent))
            #tkns = filter(None, sstr.split(u' '))
            if len(tkns) > 0:
                yield u' '.join(tkns)

    tdir, txt_files, ofile, dpl = args
    nfiles = len(txt_files)
    with codecs.open(ofile, 'w', 'utf-8') as of:
        for i, fname in enumerate(txt_files, 1):
            txt_file = os.path.join(tdir, fname)
            with codecs.open(txt_file, 'r', 'utf-8') as f:
                text = u' '.join(f.readlines())
            if dpl:
                # document-per-line mode: join all sentences on one line
                doc_str = u' '.join(sent for sent in split_sentences(text))
                of.write(doc_str)
                of.write(u'\n')
                of.flush()
            else:
                for sent in split_sentences(text):
                    of.write(sent)
                    of.write(u'\n')
                of.flush()
            print u'{}/{}) Completed {} --> {}'.format(i, nfiles, txt_file, ofile).encode('utf-8')
def get_stock_info():
    """Scan the extracted sentences for stock mentions.

    Returns {stock: [context words]}: for every stock found in a sentence,
    the sentence's filtered word list is appended to that stock's entry.
    """
    sents = get_sentences()
    stock_data = {}
    # PERF FIX: stopwords.words('english') was re-read from the NLTK corpus
    # for EVERY word of every sentence.  Build the lookup set once; set
    # membership is also O(1) per test.
    stop_set = set(stopwords.words('english'))
    tokenizer = PunktWordTokenizer()
    for sent in sents:
        words = tokenizer.tokenize(sent)
        filt_words = [w for w in words
                      if not (w.lower() in stop_set or w in punc)]
        found = stock_in_senctence(filt_words)
        # for every stock found append this sentence's word list
        for s in found:
            stock_data.setdefault(s, []).extend(filt_words)
    return stock_data
def preprocess_txt(self, text, convertlower=True, nopunk=True, stopwords=True,
                   lemmatize_doc=True, lemmatize_pos=True, stemmed=False):
    """Run the configurable normalization pipeline over `text`.

    Stages (each gated by its flag, in order): lower-casing, punctuation
    removal, Punkt word tokenization + whitespace trimming, stopword
    removal, lemmatization (POS-aware when lemmatize_pos is set), and
    Porter stemming.  Returns the resulting token list.
    """
    if convertlower:
        text = text.lower()
    if nopunk:
        text = self.remove_punct(text)
    # tokenize and trim any stray surrounding whitespace in one pass
    tokens = [token.strip() for token in PunktWordTokenizer().tokenize(text)]
    if stopwords:
        tokens = self.removestopwords(tokens)
    if lemmatize_doc:
        tokens = self.lemmatize(tokens, lemmatize_pos)
    if stemmed:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    return tokens
def replace_indices(self, indices, text):
    """Replace the tokens at `indices` in the tokenized `text`, swapping the
    alphanumeric core of each token through the theme objects, and return
    the re-joined string."""
    text_tokens = PunktWordTokenizer().tokenize(text)
    pos_tags = nltk.pos_tag(text_tokens)
    print(len(pos_tags))
    print(pos_tags)
    print(len(text_tokens))
    # result = [token.replace(filter(str.isalnum, token), )]
    for index in indices:
        try:
            # keep only the alphanumeric core of the token
            stripped_word = filter(str.isalnum, text_tokens[index])
            # NOTE(review): _old_theme.replace(...) takes three arguments,
            # so it is a project theme object rather than str -- confirm
            # its contract before changing this call.
            text_tokens[index] = text_tokens[index].replace(stripped_word, self._old_theme.replace(self._new_theme, stripped_word, pos_tags[index][1]), )
        except IndexError as detail:
            # NOTE(review): text_tokens[index] in this message re-raises the
            # same IndexError when `index` was out of range -- confirm.
            print("Index error with {}, {}".format(text_tokens[index], detail))
            continue
    return " ".join(text_tokens)
def _tokenize(self): tok=PunktWordTokenizer() #tok=TreebankWordTokenizer() split_whitespace=lambda: re.compile(r'(\s+)').split(re.sub(u"\."," .",self.text)) return list(chain(*[s if s.isspace() else tok.tokenize(s) for s in split_whitespace()]))
# --- CLI flags, optional HMM training, and endless sampling loop ---
parser.add_argument('--verbose', '-v', action='store_true')
parser.add_argument('--speak', '-s', action='store_true')
args = parser.parse_args()

logging_level = logging.DEBUG if args.verbose else logging.INFO
logging.basicConfig(level=logging_level)

if args.model:
    # Load a previously trained model instead of training.
    logging.debug('loading model...')
    hmm = load_model(args.model)

if args.corpus:
    # Train an unsupervised HMM over the tokenized corpus.
    logging.debug('loading corpus...')
    corpus = open(args.corpus, 'rb').read()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    word_detector = PunktWordTokenizer()
    sentences = sent_detector.tokenize(corpus.strip())
    words = [cleanup_words(word_detector.tokenize(s)) for s in sentences]
    logging.debug('training model...')
    # 8 hidden states; the symbol set comes from the tokenized corpus.
    trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=range(8), symbols=symbols(words))
    hmm = trainer.train_unsupervised(sequences(words), max_iterations=5)
    logging.debug('saving model...')
    save_model(args.corpus + '.hmm', hmm)

logging.debug('sampling model...')
# Generate random utterances of 5-15 words forever.
while(True):
    utterance = sample(hmm, random.randint(5, 15)) + '.'
    print utterance
# Twitterizer script: rewrites the words of an input file using word
# clusters (see process_clusters).  This is the file-reading preamble.
from __future__ import print_function
import sys
import re
import random
from nltk.tokenize.punkt import PunktWordTokenizer
from config import *
from process_clusters import get_another_word_in_cluster

# Shared word tokenizer for the whole script.
tkn = PunktWordTokenizer()

if len(sys.argv) <= 1:
    sys.stderr.write('Please specify the file you want to Twitterize\n')
    sys.exit()

with open(sys.argv[1], 'r') as f:
    # hacky way of keeping track of whether we're in a quote or anchor tag
    # I assume it just toggles, no nesting
    in_open_quote = False
    in_open_anchor_tag = False
    previous_word_translated = False
    for line in f:
        # Pad markup and punctuation with spaces so tokens split cleanly.
        # NOTE(review): the first replace's source argument looks like a
        # plain space mapped to a space (a no-op) -- it may originally have
        # been a non-breaking space or '&nbsp;' lost in transit; confirm.
        line = line.replace(' ', ' ')\
            .replace('>', '> ').replace('<', ' <')\
            .replace(START_QUOTE, 'START_QUOTE ')\
            .replace(END_QUOTE, ' END_QUOTE')\
            .replace('.', ' .')
def __init__(self):
    """Build the NLTK Punkt word tokenizer this instance delegates to."""
    # One tokenizer per instance; reused for every tokenize call.
    self.tokenizer = PunktWordTokenizer()
def __init__(self, *dictionaries):
    """Keep the given lookup dictionaries and build a word tokenizer.

    @param dictionaries: any number of dictionary objects, stored as a
        tuple in the order given.
    """
    self.dictionaries = dictionaries
    self.tokenizer = PunktWordTokenizer()
class NltkTools:
    """Bundle of NLTK-based tokenization, POS tagging and stemming helpers."""

    # Sentence-final run of 1-2 char dotted tokens, e.g. "e.g." or "U.S."
    _abbrevPattern = re.compile(r"([\w][\w]?[.]){2,}$", re.UNICODE)
    # Sentence-final 2- or 4-digit number followed by a period (e.g. "2006.")
    _datePattern = re.compile(r"(^|\s)(?:[\d]{2}){1,2}[.]$", re.UNICODE)
    # Punctuation glued between two words: captures the surrounding letters
    # so cleanup_puncts() can re-insert the missing space.
    _cleanerPattern = re.compile("(\w\w)([.?,:;!])(\w)(\w)", re.UNICODE)

    def __init__(self, tok=False, wtok=False, stok=False, pos=False,
                 stem=False, pos_model=None, abbrev_set=None):
        """Set up only the components that were requested.

        @param tok: shorthand that enables both wtok and stok.
        @param wtok: build the word tokenizer.
        @param stok: build the sentence tokenizer.
        @param pos: build the Hunpos POS tagger.
        @param stem: build the WordNet lemmatizer.
        @param pos_model: path to a Hunpos model; falls back to
            $HUNPOS/english.model when None.
        @param abbrev_set: a set of frequent abbreviations."""
        if tok:
            wtok = True
            stok = True
        if wtok:
            self.wordTokenizer = PunktWordTokenizer()
            #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
            # Bragantino,2006.In fix this shit
            self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)
        if stok:
            try:
                self.senTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            except LookupError:
                # Fall back to an untrained sentence tokenizer.
                sys.stderr.write("WARNING: english tokenizer cannot be loaded, nltk.data does not contain in!")
                sys.stderr.write("WARNING: using an untrained sen_tokenizer")
                self.senTokenizer = PunktSentenceTokenizer()
        self.abbrev_set = (set(abbrev_set) if abbrev_set is not None else set())
        if pos:
            if pos_model is not None:
                self.posTagger = HunposTagger(pos_model, encoding="utf-8")
            else:
                self.posTagger = HunposTagger(os.path.join(os.environ['HUNPOS'], 'english.model'), encoding="utf-8")
        if stem:
            self.stemmer = WordNetLemmatizer()

    def tokenize(self, raw):
        """Runs sentence and then word tokenization. Does some abbreviation-
        detection to fix false sentence endings."""
        sentences = self.sen_tokenize(raw)
        tokens = [self.word_tokenize(sen) for sen in sentences]
        # Walk backwards so pops don't shift unvisited indices. Note the
        # precedence: merge when the last token is a known abbreviation, OR
        # (it matches the abbreviation pattern AND the next sentence does not
        # start with an upper-case letter).
        for i in reversed(xrange(len(tokens) - 1)):
            if ( self.is_abbrev(tokens[i][-1])
                 or NltkTools._abbrevPattern.match(tokens[i][-1]) is not None
                 and not NltkTools.starts_with_upper(tokens[i + 1][0])):
                tokens[i].extend(tokens[i + 1])
                tokens.pop(i + 1)
        return tokens

    def sen_tokenize(self, raw):
        """Tokenizes the raw text into sentences."""
        raw = NltkTools.cleanup_puncts(raw)
        return self.senTokenizer.tokenize(raw)

    def filter_long_sentences(self, raw, length=1024):
        """Filters "sentences" (non-whitespace character sequences longer
        than length) from the text."""
        # TODO: This looks nice but it is too generous with memory use
        return ' '.join(filter(lambda x: len(x) <= length, re.split(r"\s+", raw)))

    def sen_abbr_tokenize(self, raw):
        """Tokenizes the raw text into sentences, and tries to handle
        problems caused by abbreviations and such."""
        sentences = self.sen_tokenize(raw)
        # Same backward merge as tokenize(), but on raw sentence strings.
        for i in reversed(xrange(len(sentences) - 1)):
            if (NltkTools._abbrevPattern.search(sentences[i]) is not None
                    and not NltkTools.starts_with_upper(sentences[i + 1])):
                sentences[i] = ' '.join(sentences[i:i+2])
                sentences.pop(i + 1)
        return sentences

    @staticmethod
    def starts_with_upper(text):
        """Checks if the sentence starts with an upper case letter."""
        t = text.lstrip()
        return len(t) > 0 and t[0].isupper()

    @staticmethod
    def cleanup_puncts(raw):
        """Re-insert the space after punctuation glued between two words
        ("word.Next" -> "word. Next"), skipping likely abbreviations
        (period followed by lower case) and digit.digit contexts."""
        pos = 0
        cleaner = NltkTools._cleanerPattern.search(raw[pos:])
        while cleaner:
            if cleaner.group(2) == "." and not cleaner.group(3)[0].isupper():
                # Period + lower case: likely an abbreviation, leave as-is.
                pos = cleaner.end()
            elif cleaner.group(1)[-1].isdigit() and cleaner.group(3)[0].isdigit():
                # Digit punctuation digit (e.g. "3.14"): leave as-is.
                pos = cleaner.end()
            else:
                changed_part_string = cleaner.expand(r"\1\2 \3\4")
                raw = raw[:cleaner.start()] + changed_part_string + raw[cleaner.end():]
                pos = cleaner.end()
            cleaner = NltkTools._cleanerPattern.search(raw, pos)
        return raw

    def is_abbrev(self, tok):
        """True if tok is one of the configured frequent abbreviations."""
        return tok in self.abbrev_set

    def word_tokenize(self, sen):
        """Tokenizes the sentence to words and splits the sentence ending
        punctuation mark from the last word and adds it as the last token."""
        tokens = self.wordTokenizer.tokenize(sen)
        if len(tokens) == 0:
            return []
        punktMatchObject = self.punktSplitter.match(tokens[-1])
        if punktMatchObject is not None and not self.is_abbrev(tokens[-1]):
            tokens = tokens[:-1] + list(punktMatchObject.groups())
        return tokens

    def pos_tag(self, sentokens):
        """POS-tag a tokenized sentence with the Hunpos tagger."""
        return self.posTagger.tag(sentokens)

    def stem(self, tokens):
        """Lemmatize (tok, pos) pairs; yields (tok, pos, stem) triples.
        penn_to_major_pos is a module-level mapping defined elsewhere."""
        return ((tok, pos, self.stemmer.lemmatize(tok, penn_to_major_pos[pos])) for tok, pos in tokens)

    def tag_raw(self, raw_text):
        """Convenience method for tagging (a line of) raw text.

        The NltkTools instance must have been initialized with
        C{pos=True, stem=True, tok=True}. It is a generator: returns the
        attribute array of one word at a time. The attributes are the word,
        the pos tag and the stem. An empty list marks a sentence boundary."""
        sens = self.tokenize(raw_text)
        pos_tagged = list(self.pos_tag(sen) for sen in sens)
        stemmed = list(self.stem(pos_tagged_sen) for pos_tagged_sen in pos_tagged)
        for sen in stemmed:
            for tok in sen:
                yield tok
            yield []
        return
class Word_Tokenizer():
    """Thin wrapper around NLTK's PunktWordTokenizer."""

    def __init__(self):
        """Create the underlying Punkt word tokenizer once per instance."""
        self.tokenizer = PunktWordTokenizer()

    def tokenize(self, sentence):
        """Return the list of word tokens found in *sentence*."""
        return self.tokenizer.tokenize(sentence)
def get_tokens(text, remove_stopwords=True):
    """Lower-case and word-tokenize *text* with PunktWordTokenizer.

    Keeps a token when it is longer than one character or is alphabetic,
    and (when *remove_stopwords* is True) is not an English stopword.

    @param text: raw input string.
    @param remove_stopwords: drop English stopwords (default True).
    @return: list of token strings.
    """
    tokenizer = PunktWordTokenizer()
    # Fix: the original called stopwords.words('english') — which builds a
    # fresh list — and did an O(n) scan of it for EVERY token. Build a set
    # once; an empty set makes the filter a no-op when disabled.
    stop = set(stopwords.words('english')) if remove_stopwords else set()
    return [term for term in tokenizer.tokenize(text.lower())
            if (len(term) > 1 or term.isalpha()) and term not in stop]