def identify_language(self, document, default_lang=None):
    # Extract ngrams
    unigrams = regexp_tokenize(document, pattern_unigrams)
    bigrams = regexp_tokenize(document, pattern_bigrams)
    # Create frequency distributions
    doc_fdist = FreqDist(unigrams + bigrams)
    predicted_lang = default_lang
    max_sim = 0.5
    for k, v in self._prototypes.items():
        sim = cosineOnDicts(v, doc_fdist, self._union)
        if sim > max_sim:
            max_sim = sim
            predicted_lang = k
    return predicted_lang
def get_score(self, document, lang):
    # Extract ngrams
    unigrams = regexp_tokenize(document, pattern_unigrams)
    bigrams = regexp_tokenize(document, pattern_bigrams)
    # Create frequency distributions
    doc_fdist = FreqDist(unigrams + bigrams)
    sim = cosineOnDicts(self._prototypes[lang], doc_fdist, self._union)
    return sim
def find_version(text):
    digit_pattern = r"(?:(\d+)\.)?(?:(\d+)\.)?(\*|\d+)"
    pattern = r"\s?[vV]ersion\s?" + digit_pattern
    pattern += r"| [vV]er\s?\.?\s?" + digit_pattern
    pattern += r"| [vV]\s?\.?\s?" + digit_pattern
    version_matches = regexp_tokenize(text, pattern)
    pattern = digit_pattern + "$"
    versions = []
    for version in version_matches:
        matches = regexp_tokenize(version, pattern)
        for match in matches:
            versions.append(match)
    return versions
def are_duplicates(doc1, doc2):
    if len(doc1) > 50 and len(doc2) > 50 and not are_duplicates(doc1[:50], doc2[:50]):
        return False
    txt_tokens_1 = regexp_tokenize(doc1, pattern_words)
    txt_tokens_2 = regexp_tokenize(doc2, pattern_words)
    ngrams_1 = txt_tokens_1 + generate_ngrams(txt_tokens_1, 2)
    ngrams_2 = txt_tokens_2 + generate_ngrams(txt_tokens_2, 2)
    overlap = len([w for w in ngrams_1 if w in ngrams_2])
    score = (2 * overlap) / (len(ngrams_1) + len(ngrams_2) + 1)
    return score > 0.8
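# A minimal, self-contained usage sketch of are_duplicates (Python 3). The helpers
# below are illustrative stand-ins for the module's own pattern_words and
# generate_ngrams, which are not shown here.
from nltk.tokenize import regexp_tokenize

pattern_words = r"\w+"  # assumed word pattern

def generate_ngrams(tokens, n):
    # Join each run of n adjacent tokens into one string.
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

doc_a = "the quick brown fox jumps over the lazy dog"
print(are_duplicates(doc_a, doc_a))                             # True: identical texts
print(are_duplicates(doc_a, "an entirely different sentence"))  # False: little overlap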
def main():
    args = argument_parser.main()

    global sql
    sql = SQLConnector(host=args.host, port=args.port, user=args.user, passwd=args.password, db=args.db)
    global bing
    bing = BingSearch()
    global new_software
    new_software = NewSoftware()
    global possible_tags
    possible_tags = []

    mongo = MongoConnector(host=args.H, db=args.db)

    for page in range(1):
        res = sql.load_data(page)
        rows = res.num_rows()
        if not rows:
            print "No tweets left to analyse"
            break

        for _i_ in range(1):  # rows):
            for tweet in res.fetch_row():
                tweet_id = str(tweet[0])
                text = tweet[1].lower()
                # text = "Version 2 Microsoft just released MS Office ver 3.20.2 for 99 cent 100c 10ps 13pence 10 pence"

                urls = find_url(text)
                for url in urls:
                    text = text.replace(url, "").strip()

                versions = find_version(text)
                words = regexp_tokenize(text, pattern=r"\w+([.,]\w+)*|\S+")
                # print words
                prices = find_price(words)
                pos_ = pos(words)
                ngram = ngrams(words, 5)

                try:
                    tagged_tweet = tag_tweets(ngram, tweet_id)
                    tagged_tweet.add("tweet_text", text)
                    tagged_tweet.add("sentiment", tweet[2])
                    tagged_tweet.add("url", urls)
                    tagged_tweet.add("version", versions)
                    tagged_tweet.add("price", prices)

                    if tweet_id in possible_tags:
                        print tweet_id
                    else:
                        if tagged_tweet.contains("software_id") or tagged_tweet.contains("operating_system_id"):
                            print tweet
                            print tagged_tweet
                            print
                            # mongo.insert(tagged_tweet)
                        else:
                            print tweet, "No software"
                        # sql.setTagged(tagged_tweet.get('tweet_db_id'))
                except IncompleteTaggingError, e:
                    # This will allow the tweet to be tagged again at a later stage
                    print tweet_id + ":", e
                    print tweet
                    print
def word_split(text):
    """
    Split a text into words. Returns a list of the words it contains.
    """
    return regexp_tokenize(text.lower().strip(), pattern=r'\w+')
def simhash(raw_text):
    """Compute the simhash value for a string."""
    fdist = FreqDist()
    for word in regexp_tokenize(raw_text, pattern=r'\w+([.,]\w+)*|\S+'):
        fdist.inc(word.lower())

    v = [0] * 128
    for word in fdist:
        projection = bitarray()
        projection.fromstring(hashlib.md5(word).digest())
        # print "\tw:%s, %d" % (word, fdist[word])
        # print "\t\t 128 bit hash: " + str(b)
        for i in xrange(128):
            if projection[i]:
                v[i] += fdist.get(word)
            else:
                v[i] -= fdist.get(word)

    hash_val = bitarray(128)
    hash_val.setall(False)
    for i in xrange(128):
        if v[i] > 0:
            hash_val[i] = True
    return hash_val
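# Usage note (not from the original module): simhash fingerprints are compared by
# Hamming distance; near-duplicate texts differ in only a few of the 128 bits.
# A minimal sketch assuming the simhash() above and the same bitarray package;
# the 3-bit threshold is an illustrative choice.
def hamming_distance(h1, h2):
    # XOR the two 128-bit fingerprints and count the differing bits.
    return (h1 ^ h2).count(True)

def looks_like_duplicate(text_a, text_b, max_distance=3):
    return hamming_distance(simhash(text_a), simhash(text_b)) <= max_distance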
def tokenizeList(tokenList):
    # remove stop words, punctuation & stem words to create tokens out of phrases and names
    tokenized_list = []
    for item in tokenList:
        tokenized = regexp_tokenize(item.lower(), "[\w']+")
        for word in tokenized:
            if word not in english_stops:
                stemmed = stemmer.stem(word).encode('ascii', 'ignore').lstrip().lower().translate(None, string.punctuation)
                if not stemmed.isalpha():
                    if stemmed.isdigit():
                        stemmed = 'NUMBER'
                        tokenized_list.append(stemmed)
                    elif stemmed.isalnum():
                        stemmed = 'ALPHANUM'
                        tokenized_list.append(stemmed)
                else:
                    tokenized_list.append(stemmed)
    '''
    filtered = [word for word in tokenized if word not in english_stops]
    stemmed = [stemmer.stem(word).encode('ascii', 'ignore').lstrip().lower().translate(None, string.punctuation) for word in filtered]
    stemmed = [word for word in stemmed if word != '']
    tokenized_list.extend(stemmed)
    '''
    return tokenized_list
def __init__(self, sentence):
    self.sentence = sentence
    self.forms = []
    for s in tuple(open(FORMS, "r")):  # read the user_forms from file
        self.forms.append([w for w in regexp_tokenize(s, "[-\w]+") if w.isalnum()])
    if self.is_valid():
        self.tokens = regexp_tokenize(self.sentence, "(\\$)|[-\w]+")  # tokenizing with regex
        self.stop_words = set(stop.words("english"))  # filtering tokens words to remove
        self.filtered = [w.lower() for w in self.tokens if w not in self.stop_words]  # remove stop words
        self.spell_checked = self.spell_check()
        self.tags = pos_tag(self.spell_checked, tagset="universal")  # speech tagging (identification)
        print(self.tags)
        self.digits = self.get_digits()
        self.user_form = self.get_user_form()
def getTokenizedQueries():
    queriesFileName = "../cacm.query"
    f = open(queriesFileName, 'r')
    i = 0
    queriesList = {}
    isText = False
    for lineWithEnter in f:
        line = lineWithEnter[:-1]
        if len(line) == 0:
            continue
        elif line[0] == '<' or (line[0] == ' ' and len(line) == 1):
            isText = False
            continue
        else:
            if not isText:
                isText = True
                queriesList[i] = ""
                queriesList[i] += line
                i += 1
            else:
                queriesList[i - 1] += " "
                queriesList[i - 1] += line
                # print line
    tokenizedQueriesList = {}
    for q in queriesList:
        tokenizedQueriesList[q] = regexp_tokenize(queriesList[q], pattern='[\d]+[\.\,\d]*[\d]+\%?|\[\d+\]|[\w\-]+')
    return tokenizedQueriesList
def tag_and_tokenize(self, file):
    '''Tokenize, chunk and tag string 's'; the bulk of the script's work (time) is done here'''
    self.text = get_docx_text(file)
    self.sentences = ""
    print("Tokenize and tagging...")
    self.sentences = regexp_tokenize(self.text, pattern='\w+|\$[\d\.]+|\S+')
    self.sentences = [st.tag(self.sentences)]
    print("Tagging done")
def words(text, splitContractions=False, contractChars=["'"]):
    '''Uses a regexp tokenizer to tokenize text into words.

    If splitContractions is true, the regex pattern is [\w]+ so that contractions
    are split, e.g. "I can't" -> ['I', 'can', 't']; otherwise the regex pattern is
    [\w']+ so that contractions are not split, i.e. "I can't" -> ['I', "can't"].
    Additional contraction characters, e.g. a hyphen, can be added by overriding
    the contractChars arg.'''
    if splitContractions:
        pat = "[\w]+"
    else:
        pat = "[\w{0}]+".format(reduce(lambda x, y: x + y, contractChars, ""))
    return regexp_tokenize(text, pat, discard_empty=True)
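# Quick check of the two modes described in the docstring above (expected output
# shown in the trailing comments; under Python 3, reduce must come from functools):
print(words("I can't", splitContractions=True))              # ['I', 'can', 't']
print(words("I can't"))                                      # ['I', "can't"]
print(words("state-of-the-art", contractChars=["'", "-"]))   # ['state-of-the-art']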
def index(request):
    if request.method == "POST":
        if request.POST.get("tokens"):
            with open(settings.BASE_DIR + "/data/corpus.pkl", 'rb') as handle:
                corpus = pickle.load(handle)
            tokens = ast.literal_eval(request.POST.get("tokens"))
            tagged = []
            i = 1
            for item in tokens:
                tagged.append((item, request.POST.get("token_" + str(i))))
                i += 1
            if tagged not in corpus:
                corpus.append(tagged)
                with open(settings.BASE_DIR + "/data/corpus.pkl", 'wb') as handle:
                    pickle.dump(corpus, handle)
                tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
                symbols = unique_list(word for sent in corpus for (word, tag) in sent)
                trainer = HiddenMarkovModelTrainer(tag_set, symbols)
                hmm = trainer.train_supervised(corpus, estimator=LaplaceProbDist)
                with open(settings.BASE_DIR + "/data/hmm.pkl", 'wb') as handle:
                    pickle.dump(hmm, handle)
            return render(request, 'tagger/index.html', {'corpus': corpus})
        else:
            if request.POST.get("random") == 'true':
                address = get_random_address()
                if not address:
                    return render(request, 'tagger/index.html', {'error_message': 'No random addresses left'})
            else:
                address = request.POST.get("address")
            tokens = regexp_tokenize(address, pattern=r'\d+|[^\r\n\t\f 0-9,]+|,')
            if tokens:
                pkl_file = open(settings.BASE_DIR + "/data/hmm.pkl", 'rb')
                hmm = pickle.load(pkl_file)
                pkl_file.close()
                tagged = hmm.tag(tokens)
                tags_file = open(settings.BASE_DIR + "/data/tags.json", 'rb')
                reader = codecs.getreader("utf-8")
                tags = json.load(reader(tags_file))
                tags_file.close()
                return render(request, 'tagger/index.html', {
                    'address': address,
                    'tokens': tokens,
                    'tagged': tagged,
                    'tags': sorted(tags.items(), key=operator.itemgetter(1)),
                })
    return render(request, 'tagger/index.html', {})
def getReviews(rootdir):
    reviews = []
    unique = []
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder, filename), 'r') as src:
                review = src.read()
                words = regexp_tokenize(review, "\w+")
                for word in words:
                    unique.append(word)
                reviews.append(review)
    return reviews
def _tokenize_content(self):
    tokenized_content = []
    raw_content = self._clean_content()
    content_sents = sent_tokenize(raw_content)
    content_words_by_sents = map(lambda sent: word_tokenize(sent), content_sents)
    stopwords = regexp_tokenize(STOPWORDS, "[\w']+")
    extra_puncts = ['),', ').', '%),', '%).', '):', '()', '://', '>.', '.;', '...', '/>.']
    puncts = list(punctuation) + extra_puncts
    stopwords.extend(puncts)
    for sent in content_words_by_sents:
        clean_sent = [word for word in sent if word not in stopwords]
        tokenized_content.append(clean_sent)
    return tokenized_content
def get_features(review, polarity):
    features = {}
    uniqueWords = 0
    personalRatio = 0
    personal = 0
    misspelt = 0
    hotelName = 0
    personalPronouns = ["i", "me", "we", "our", "ours", "mine"]
    sentences = sent_tokenize(review)
    sent = nltk.word_tokenize(review)
    s = len(sentences)
    wordsR = regexp_tokenize(review, "\w+")
    for x in wordsR:
        if x in personalPronouns:
            personal += 1
        # if x not in set(words.words()):
        #     misspelt += 1
        if x in hotels:
            hotelName += 1
    w = len(wordsR)
    unique = len(set(wordsR))
    uniqueWords += unique
    review = review.replace(" ", "")
    c = len(review)
    cap = 0
    features['dollar'] = False
    for i in range(len(review)):
        if review[i].isupper():
            cap += 1
        if review[i] == '$':
            features['dollar'] = True
    ari = 4.71 * (float(c) / w) + 0.5 * (float(w) / s) - 21.43
    capRatio = c / float(s)
    personalRatio += float(personal) / w
    features['uniqueWords'] = uniqueWords
    features['personalRatio'] = personalRatio
    features['ari'] = ari
    features['capRatio'] = capRatio
    features['polarity'] = polarity
    features['hotel'] = hotelName
    ngrams = get_bigrams(review, 'x')
    sentiments = get_sentimentFeatures(review, 'x')
    for x in ngrams.keys():
        features[x] = ngrams[x]
    for x in sentiments.keys():
        features[x] = sentiments[x]
    features['misspelt'] = misspelt
    return features
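# Sanity check of the ARI formula used above, with made-up counts (500 non-space
# characters, 100 words, 5 sentences); these numbers are illustrative only.
c, w, s = 500, 100, 5
print(4.71 * (float(c) / w) + 0.5 * (float(w) / s) - 21.43)   # ~12.12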
def tokenize_text(page_text):
    """ Tokenizes text using NLTK and regEx """
    pattern = r"""(?:[A-Z][.])+|([1-9]|1[0-2]|0[1-9]){1}(:[0-5][0-9][aApP][mM]){1}|([0]?[1-9]|[1|2][0-9]|[3][0|1])[./-]([0]?[1-9]|[1][0-2])[./-]([0-9]{4}|[0-9]{2})|[$?|\-?]\d[\d,.:\^\-/\d]*\d|((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)|\w+[\w\-\#\@\'.&$]*\w+|[\@|\#|\&]?\w+(\w+)?|[:punct:]"""
    tokens = regexp_tokenize(page_text.strip().lower(), pattern)
    tokens = [cleanup(w) for w in tokens]
    tokens = [w for w in tokens if ((len(w) > 1) and (money(w) or alpha_num(w)))]
    tokens = [LMTZR.lemmatize(w) for w in tokens]
    return tokens
def tokenize_text(page_text):
    ''' Tokenizes text using NLTK and regEx '''
    pattern = r'''(?:[A-Z][.])+|([1-9]|1[0-2]|0[1-9]){1}(:[0-5][0-9][aApP][mM]){1}|([0]?[1-9]|[1|2][0-9]|[3][0|1])[./-]([0]?[1-9]|[1][0-2])[./-]([0-9]{4}|[0-9]{2})|[$?|\-?]\d[\d,.:\^\-/\d]*\d|((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)|\w+[\w\-\#\@\'.&$]*\w+|[\@|\#|\&]?\w+(\w+)?|[:punct:]'''
    remove_list = ["[", "]", "{", "}", "(", ")", "'", ".", "..", "...", ",", "?", "!", "/",
                   "\"", "\"", ";", ":", "-", "�", "_", "�", "�", "`", "~", "@", "$", "^",
                   "|", "#", "=", "*", "?"]
    # Lower-casing the text here may affect performance.
    tokens = regexp_tokenize(page_text, pattern)
    # Remove unnecessary words.
    wt = [w for w in tokens if ((w not in remove_list) and (len(w) > 1))]
    return wt
def countW(rootdir):
    reviews = []
    unique = []
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder, filename), 'r') as src:
                review = src.read()
                words = regexp_tokenize(review, "\w+")
                for word in words:
                    unique.append(word)
                reviews.append(review)
    unique = set(unique)
    uniqueR = []
    for w in unique:
        if w not in stopwords.words('english'):
            uniqueR.append(w)
    print(len(set(uniqueR)))
def calculateAGARI(rootdir):
    avgARI = 0
    count = 0
    uniqueWords = 0
    personalRatio = 0
    dollarCount = 0
    personalPronouns = ["i", "me", "we", "our", "ours", "mine"]
    hotelName = 0
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder, filename), 'r') as src:
                review = src.read()
                personal = 0
                sentences = sent_tokenize(review)
                s = len(sentences)
                capitals = 0
                words = regexp_tokenize(review, "\w+")
                for x in words:
                    if x in personalPronouns:
                        personal += 1
                    if x in hotels:
                        hotelName += 1
                w = len(words)
                unique = len(set(words))
                uniqueWords += unique
                review = review.replace(" ", "")
                flag = "f"
                for i in range(len(review)):
                    if review[i].isupper():
                        capitals += 1
                    if review[i] == '$':
                        flag = "t"
                if flag == "t":
                    dollarCount += 1
                c = len(review)
                ari = 4.71 * (float(c) / w) + 0.5 * (float(w) / s) - 21.43
                avgARI += ari
                count += 1
                personalRatio += float(personal) / w
                # print(nltk.ne_chunk(review))
    print("\n" + rootdir)
    print("ARI : " + str(float(avgARI / count)))
    print("Unique words" + " " + str(uniqueWords / float(count)))
    print("Ratio personal : " + str(personalRatio / float(count)))
    print("DollarCount :" + str(dollarCount))
def _tag(self, tweet):
    tweet_id = str(tweet[0])
    original = tweet[1].decode('utf-8', 'ignore')
    text = original.lower().replace('#', '').strip()
    # text = "download 60 hundred pounds 72 million $800 billion pounds holiday havoc v2 in itunes for free 99"
    urls = find_url(original)
    for url in urls:
        text = text.replace(url.lower(), "").strip()
    word_freqs = word_frequencies(text)
    # print word_freqs
    versions = find_version(text)
    words = regexp_tokenize(text, pattern=r'\w+([.,]\w+)*|\S+')
    prices = find_price(words)
    five_gram = self._create_ngram(tokenized=words, gram_length=5)
    tagged_tweet = self._ngram_tagger(five_gram, tweet_id)
    tagged_tweet.add('sentiment', tweet[2])
    tagged_tweet.add('tweet', original)
    tagged_tweet.add('url', urls)
    tagged_tweet.add('version', versions)
    tagged_tweet.add('price', prices)
    if tagged_tweet.contains('software_name'):
        query = {'software_name': tagged_tweet.get('software_name')}
        words = {}
        for w in word_freqs:
            words['words.' + w] = word_freqs[w]
        # print query, words
        self._mongo.update_freqs(query, words)
    return tagged_tweet
import nltk
from nltk.tokenize import regexp_tokenize
import math
import collections
from operator import itemgetter

if __name__ == '__main__':
    # preparing files for read & write
    wordsContent = open('words.txt', 'rU')
    documentContent = open('documents.txt', 'rU')
    numDocsContainWordContent = open('numDocsContainWord.txt', "w", 0)

    # read words into a wordList
    wordList = []
    for line in wordsContent:
        tabs = line.split('\t')
        wordList.append(tabs[0])

    # read documents into a documentDict
    documentDict = {}
    for line in documentContent:
        tabs = line.split('\t')
        documentDict[tabs[0]] = regexp_tokenize(tabs[1], "[\w'#@]+")

    # save numDocsContainWord into a file
    for word in wordList:
        count = 0
        for documentId in documentDict.keys():
            if word in documentDict[documentId]:
                count += 1
        numDocsContainWordContent.write("%s\t%s\n" % (word, count))
def _tokenize_document(self, document):
    return regexp_tokenize(document, pattern_words)
def renltk_tokenize(text):
    text = clean_text(text)
    words = regexp_tokenize(text, pattern='\s+', gaps=True)
    return words
def tokenize_grafs(text):
    return regexp_tokenize(text, r'\<\/p\>', gaps=True)
from nltk.tokenize import regexp_tokenize

str_in = input()
x = int(input())
result = regexp_tokenize(str_in, r"[A-Za-z]+")
print(result[x])
def _separate_text_into_words(text: str) -> List[str]:
    regex = r'\w+'
    return regexp_tokenize(text, regex)
print(re.match(pattern2, sentences[0]))

#%%
# Regex with NLTK tokenization
# =============================================================================
tweets = [
    'This is the best #nlp exercise ive found online! #python',
    '#NLP is super fun! <3 #learning',
    'Thanks @datacamp :) #nlp #python'
]

# Define a regex pattern to find hashtags: pattern1
pattern1 = r"#\w+"
# Use the pattern on the first tweet in the tweets list
regexp_tokenize(tweets[0], pattern1)

# Write a pattern that matches both mentions and hashtags
pattern2 = r"[@#]\w+"
# Use the pattern on the last tweet in the tweets list
regexp_tokenize(tweets[-1], pattern2)

# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

#%%
# Non-ascii tokenization
# =============================================================================
def is_spanish(document):
    stopwords = nltk.corpus.stopwords.words('spanish')
    for word in regexp_tokenize(document, "[\w#@]+"):
        if word in stopwords:
            return True
    return False
# -*- coding: utf-8 -*-
"""
Created on 2018/6/17
@author: Samuel
@Desc:
@dependence: Nothing
"""

input_str = "Hi everyone! Hola gr8 &*$"
print(input_str.split())

from nltk.tokenize import word_tokenize, regexp_tokenize, wordpunct_tokenize, blankline_tokenize

output_str = word_tokenize(input_str)
print('word_tokenize: ')
print(output_str)

output_str = regexp_tokenize(input_str, pattern='\w+')
print('regexp_tokenize: ')
print(output_str)

output_str = regexp_tokenize(input_str, pattern='\d+')
print('regexp_tokenize: ')
print(output_str)

output_str = wordpunct_tokenize(input_str)
print('wordpunct_tokenize: ')
print(output_str)

output_str = blankline_tokenize(input_str)
print('blankline_tokenize: ')
print(output_str)
from nltk.tokenize import sent_tokenize
from nltk.tokenize import regexp_tokenize

sentence = sent_tokenize(input())[int(input())]
print(regexp_tokenize(sentence, "[A-Za-z']+"))
# part-1
# Import the necessary modules
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

# part-2
# Define a regex pattern to find hashtags: pattern1
pattern1 = r"#\w+"
# Use the pattern on the first tweet in the tweets list
hashtags = regexp_tokenize(tweets[0], pattern1)
print(hashtags)

# part-3
# Write a pattern that matches both mentions (@) and hashtags
pattern2 = r"[@#]\w+"
# Use the pattern on the last tweet in the tweets list
mentions_hashtags = regexp_tokenize(tweets[2], pattern2)
print(mentions_hashtags)

# part-4
# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)
from nltk.probability import FreqDist
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords

with open('wotw.txt', 'r') as file:
    data = file.read()

tokens = [word.lower() for word in regexp_tokenize(data, '\w+')]
stoplist = stopwords.words('english')
without_stops = [word for word in tokens if word not in stoplist]

freq_dist = FreqDist(without_stops)
print('Number of words: %s' % len(freq_dist))
for key in freq_dist.keys():
    print(key, freq_dist[key])

print(freq_dist.most_common(10))
print(freq_dist.most_common()[-10:])

dist_1 = [item[0] for item in freq_dist.items() if item[1] == 1]
print(len(dist_1), dist_1)
from nltk.tokenize import regexp_tokenize

text = input()
n = int(input())
print(regexp_tokenize(text, "[A-Za-z']+")[n])
import nltk
from nltk.tokenize import regexp_tokenize

sent = "Don't hesitate to ask questions"
print(regexp_tokenize(sent, pattern='\w+|\$[\d\.]+|\S+'))
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize, regexp_tokenize, wordpunct_tokenize, blankline_tokenize

with open('sentence1.txt', 'r') as myfile:
    data = myfile.read().replace('\n', '')
sentences = sent_tokenize(data)

from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

pst = PorterStemmer()
lst = LancasterStemmer()
wnl = WordNetLemmatizer()

print("Stemming / lemmatization results")
for token in regexp_tokenize(sentences[0], pattern='\w+'):
    print(token, pst.stem(token), lst.stem(token), wnl.lemmatize(token))
import nltk
from nltk.tokenize import regexp_tokenize
from operator import itemgetter

if __name__ == '__main__':
    content = open('10.txt', 'rU')
    output = open('wordlist10.txt', "w", 0)
    words = {}
    for line in content:
        # we define our words to contain ', # and @
        for word in regexp_tokenize(line, "[\w'#@]+"):
            words[word] = words.get(word, 0) + 1
    for item in sorted(words.items(), key=itemgetter(1), reverse=True):
        output.write("%s\t%s\n" % (item[0], item[1]))
'''
from nltk.tokenize import regexp_tokenize
from nltk import ne_chunk
from nltk import pos_tag
from nltk import download

download('maxent_ne_chunker')
download('words')

# Of course, NLTK ships with well-polished English support by default. Spanish, not so much.
frase = "Steven could be the main character, but Peridot is the coolest. \
    Stevonnie too, you must like Stevonnie. If not, please go to live \
    to New Jersey and leave Beach City"

frases_separadas = regexp_tokenize(frase, '\w+')
frases_tag = pos_tag(frases_separadas)

print("NLTK NER: ", ne_chunk(frases_tag, binary=False))

'''
We can also use a Stanford NER, or at least a version of it that should be
better than NLTK's.
'''
from nltk.tag.stanford import StanfordNERTagger

stanford_ner = StanfordNERTagger(
    'assets/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
    'assets/stanford-ner/stanford-ner.jar')

# As in script 04, here we pass it the whole text, without splitting anything;
# it will do that for us.
print("Stanford NER: ", stanford_ner.tag(frases_separadas))

'''
Remember that what we were looking for here were names, entities, places,
organizations... In this case, Stanford recognizes these values better, since, as
you will see, for
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

# hashtags
pattern1 = r"#\w+"
regexp_tokenize(tweets[0], pattern1)

# mentions and hashtags
pattern2 = r"([@#]\w+)"
regexp_tokenize(tweets[-1], pattern2)

tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)
import nltk
from nltk.tokenize import RegexpTokenizer, regexp_tokenize, BlanklineTokenizer, WhitespaceTokenizer, LineTokenizer, SpaceTokenizer
from nltk.tokenize.util import spans_to_relative, string_span_tokenize

text = "Don't hesitate to ask questions"
text2 = "This is a breaking news.\n A godzilla has been discovered in Tokyo city."

tokenizer = RegexpTokenizer('[\w]+')
print tokenizer.tokenize(text)
print regexp_tokenize(text, pattern='\w+|\$[\d\.]+|\S+')

# Tokenize on whitespace
tokenizer = RegexpTokenizer('\s+', gaps=True)
print tokenizer.tokenize(text)

# Select only words starting with capital letters
capt = RegexpTokenizer('[A-Z]\w+')
print capt.tokenize(text2)

print BlanklineTokenizer().tokenize(text2)
print WhitespaceTokenizer().tokenize(text2)
print LineTokenizer(blanklines='keep').tokenize(text2)
print LineTokenizer(blanklines='discard').tokenize(text2)

# SpaceTokenizer works similarly to .split(' ')
print SpaceTokenizer().tokenize(text2)
def getAmenitiesTokens(cell_val):
    return regexp_tokenize(cell_val, "([\w\s\d']+), ")
rgx_wrd_digit
re.findall(rgx_wrd_digit, 'he has 11 cats')

# [A-Za-z\-\.]+ ==== matches 'My-Website.com'
# groups are different
# (a-z) matches "a-z"

tweet0 = r'This is the best #nlp exercise ive found online! #python'
print(tweet0)

from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

# hash tags
pattern1 = r"#\w+"
regexp_tokenize(tweet0, pattern1)

tweet2 = r'Thanks @datacamp :) #nlp #python'
# mentions and hashtags
pattern2 = r"[#@]\w+"
regexp_tokenize(tweet2, pattern2)

tknzr = TweetTokenizer()
lst_tweets = [
    'This is the best #nlp exercise ive found online! #python',
    '#NLP is super fun! <3 #learning',
    'Thanks @datacamp :) #nlp #python'
]
all_tokens = [tknzr.tokenize(t) for t in lst_tweets]
all_tokens
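# Illustrative aside (not part of the original notes): a character class such as
# [a-z] matches any single lowercase letter, while a group such as (a-z) only
# matches the literal text "a-z".
import re
print(re.findall(r"[a-z]+", "a-z abc"))                          # ['a', 'z', 'abc']
print(re.findall(r"(a-z)", "a-z abc"))                           # ['a-z']
print(re.findall(r"[A-Za-z\-\.]+", "Visit My-Website.com now"))  # ['Visit', 'My-Website.com', 'now']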
# Import the necessary modules
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import regexp_tokenize

tweets = [
    'This is the best #nlp exercise ive found online! #python',
    '#NLP is super fun! <3 #learning',
    'Thanks @datacamp :) #nlp #python'
]

# Define a regex pattern to find hashtags: pattern1
pattern1 = r"#\w+"
# Use the pattern on the first tweet in the tweets list
print(regexp_tokenize(tweets[0], pattern1))

# Write a pattern that matches both mentions and hashtags
pattern2 = r"[#@]\w+"
# Use the pattern on the last tweet in the tweets list
print(regexp_tokenize(tweets[-1], pattern2))

# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

######################################################
from nltk.tokenize import word_tokenize

german_text = 'Wann gehen wir Pizza essen? 🍕 Und fährst du mit Über? 🚕'
# Tokenize and print all words in german_text
# 1.b Write a pattern to match sentence endings:
print(re.findall("[.?!]", my_string))
print(re.split('[A-Za-z]*[.?!]\?!', my_string))

# 1.d Find all capitalized words in my_string and print the result
print(re.findall(r"[A-Z]\w+", my_string))

# 1.e Split my_string on spaces and print the result
print(re.split(r"\s", my_string))

# 1.f Find all digits in my_string and print the result
print(re.findall(r"\d+", my_string))

# 2.b Tokenize all the sentences using the sent_tokenize() function.
r1 = sent_tokenize(my_string)
print(r1)

# 2.c Tokenize the fourth sentence, which you can access as sentences[3], using the word_tokenize() function.
print(word_tokenize(r1[3]))

# 2.d Find the unique tokens by using word_tokenize() and then converting it into a set using set().
r2 = word_tokenize(my_string)
r3 = set(r2)
print(r3)

my_string_2 = "SOLDIER #1: Found them? In Mercea? The coconut's tropical!"
string = re.split(r"#\d\w+\?!", my_string_2)
print(string)

tweets = ['I’ve retrieved the best #nlp exercise for the class till now! It is done #python',
          '#NLP is super cool! #learning',
          'Thanks @analytics :) #nlp #python']

pattern1 = r"#\w+"
print(regexp_tokenize(tweets[1], pattern1))

pattern2 = r"[#@]\w+"
print(regexp_tokenize(tweets[2], pattern2))

tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)
def build_vocab(self, unlabeled_data, labeled_data, embedding_size, max_seq_num, max_seq_len):
    sentences = []
    words = []
    if unlabeled_data is not None:
        for (u, v) in unlabeled_data.items():
            try:
                results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", v)
                results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", dd)
                results = re.compile(r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)', re.S)
                dd = results.sub(" <website> ", dd)
                sents = sentence_tokenize(dd)
                for j in range(0, len(sents)):
                    a = regexp_tokenize(transform_format(sents[j]), self.pattern)
                    temp = []
                    for k in range(0, len(a)):
                        if a[k] not in self.english_punctuations and check_ack_word(a[k]) == 1:
                            if a[k].isdigit():
                                a[k] = '<number>'
                            elif a[k][0] == '$':
                                a[k] = '<money>'
                            elif a[k][-1] == '%':
                                a[k] = '<percentage>'
                            temp.append(a[k].lower())
                            words.append(a[k].lower())
                    if len(temp) > 0:
                        sentences.append(temp)
            except:
                # print(u, v)
                # exit()
                pass

    if labeled_data is not None:
        for (u, v) in labeled_data.items():
            for i in range(0, len(v[0])):
                v[0][i] = str(v[0][i])
                try:
                    results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                    dd = results.sub(" <website> ", v[0][i])
                    results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                    dd = results.sub(" <website> ", dd)
                    results = re.compile(r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)', re.S)
                    dd = results.sub(" <website> ", dd)
                except:
                    print(u, v)
                    print(v[0][i])
                    exit()
                a = regexp_tokenize(transform_format(dd), self.pattern)
                temp = []
                for k in range(0, len(a)):
                    if a[k] not in self.english_punctuations and check_ack_word(a[k]) == 1:
                        if a[k].isdigit():
                            a[k] = '<number>'
                        elif a[k][0] == '$':
                            a[k] = '<money>'
                        elif a[k][-1] == '%':
                            a[k] = '<percentage>'
                        temp.append(a[k].lower())
                        words.append(a[k].lower())
                if len(temp) > 0:
                    sentences.append(temp)

    word_frequency = {}
    for i in range(0, len(words)):
        if words[i] in word_frequency:
            word_frequency[words[i]] += 1
        else:
            word_frequency[words[i]] = 1

    self.model = gensim.models.Word2Vec(sentences, size=embedding_size, window=5, min_count=1, iter=20, negative=50)

    x = 4
    self.word2id['<pad>'] = 0
    self.id2word[0] = '<pad>'
    self.word2id['<sos>'] = 2
    self.id2word[2] = '<sos>'
    self.word2id['<eos>'] = 3
    self.id2word[3] = '<eos>'
    self.unk_count = 0
    for i in range(0, len(sentences)):
        for j in range(0, len(sentences[i])):
            if word_frequency[sentences[i][j].lower()] >= 2:
                if sentences[i][j].lower() in self.model:
                    if sentences[i][j].lower() in self.word2id:
                        pass
                    else:
                        self.word2id[sentences[i][j].lower()] = x
                        self.id2word[x] = sentences[i][j].lower()
                        x = x + 1
            else:
                self.word2id['<unk>'] = 1
                self.id2word[1] = '<unk>'
                self.unk_count += 1
from nltk.tokenize import regexp_tokenize

sentence = input()
index_of_word = int(input())
pattern = "[A-Za-z]+"
res = regexp_tokenize(sentence, pattern)
print(res[index_of_word])
def load_data(self, labeled_data, ids):
    self.message = {}
    labels_esit = []
    for i in ids:
        sentences = []
        labels = []
        doc_len = []
        sent_len = []
        sents, l = labeled_data[i]
        for j in range(0, len(sents)):
            sents[j] = str(sents[j])
            results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
            dd = results.sub(" <website> ", sents[j])
            results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
            dd = results.sub(" <website> ", dd)
            results = re.compile(r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)', re.S)
            dd = results.sub(" <website> ", dd)
            a = regexp_tokenize(transform_format(dd), self.pattern)
            temp = []
            for k in range(0, len(a)):
                if a[k] not in self.english_punctuations and check_ack_word(a[k]) == 1:
                    if a[k].isdigit():
                        a[k] = '<number>'
                    elif a[k][0] == '$':
                        a[k] = '<money>'
                    elif a[k][-1] == '%':
                        a[k] = '<percentage>'
                    temp.append(a[k].lower())
            if len(temp) > 0:
                temp_ = ['<sos>']
                for k in range(0, min(len(temp), self.max_seq_len - 2)):
                    temp_.append(temp[k])
                temp_.append('<eos>')
                sentences.append(temp_)
                labels.append(self.lookup_label_id(l[j]))
                labels_esit.append(self.lookup_label_id(l[j]))
                sent_len.append(len(temp_) - 1)
        doc_len.append(len(sents) - 1)
        self.message[i] = (sentences, labels, sent_len, doc_len)

    x_d = set()
    for (u, v) in self.label_set.items():
        x_d.add(v)
    x_d = np.array(list(x_d))
    self.kde.fit(np.array(labels_esit)[:, None])
    self.dist = self.kde.score_samples(x_d[:, None])
    self.esit_dist = F.softmax(torch.tensor(self.dist), dim=-1)
def tokenize_sentence(sentence):
    # return nltk.word_tokenize(sentence.lower())
    return regexp_tokenize(sentence.lower(), "[\w']+")
def load_data(self, unlabeled_data, ids):
    self.message = {}
    self.ids = []
    self.data_num = 0
    for i in ids:
        try:
            sentences = []
            labels = []
            doc = unlabeled_data[i]
            doc_len = []
            sent_len = []
            doc += '.'
            results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
            dd = results.sub(" <website> ", doc)
            results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
            dd = results.sub(" <website> ", dd)
            results = re.compile(r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)', re.S)
            dd = results.sub(" <website> ", dd)
            sents = sentence_tokenize(dd)
            # print(sents)
            for j in range(0, len(sents)):
                a = regexp_tokenize(transform_format(sents[j]), self.pattern)
                temp = []
                for k in range(0, len(a)):
                    if a[k] not in self.english_punctuations and check_ack_word(a[k]) == 1:
                        if a[k].isdigit():
                            a[k] = '<number>'
                        elif a[k][0] == '$':
                            a[k] = '<money>'
                        elif a[k][-1] == '%':
                            a[k] = '<percentage>'
                        temp.append(a[k].lower())
                if len(temp) > 0:
                    temp_ = ['<sos>']
                    for k in range(0, min(len(temp), self.max_seq_len - 2)):
                        temp_.append(temp[k])
                    temp_.append('<eos>')
                    sentences.append(temp_)
                    labels.append(10)
                    sent_len.append(len(temp_) - 1)
            doc_len.append(min(len(sents) - 1, self.max_seq_num - 1))
            self.message[i] = (sentences[:self.max_seq_num],
                               labels[:self.max_seq_num],
                               sent_len[:self.max_seq_num],
                               doc_len)
            self.ids.append(i)
        except:
            if str(doc) != "nan":
                print(doc)
            pass
from nltk.tokenize import regexp_tokenize
import glob


def readfile(filename):
    messages = [line.rstrip() for line in open('SMSSpamCollection.txt')]
    # print(input)
    return messages


filestopwords = open("stopwords.txt").read()


def casefolding(input):
    text = []
    for row in input:
        text.append(row)
    text = [x.lower() for x in text]
    return text


messages = readfile('SMSSpamCollection.txt')
print(messages)
print(casefolding(messages))

# Collect file names and token lists for the training files.
filename = []
data = []
training_files = glob.glob('SMSSpamCollection.txt')
for file_name in training_files:
    text = open(file_name, encoding="UTF-8").read()
    filename.append(file_name.split("\\")[1])
    tokens = regexp_tokenize(text, r'[A-Za-z]{3,}')
    data.append(tokens)
def tokenize(s):
    if type(s) is not unicode:
        s = s.decode('utf8')
    return regexp_tokenize(s, pattern='[^\W_]+|\S')
# Store most common words and ngrams for later comparison of texts
words_most_common = []
ngrams_most_common = []

# Load stopwords if necessary
if params.s:
    stopwords = [sw.strip() for sw in params.s.readlines()]
    params.s.close()

# Process documents
for f in params.files:
    txt = f.read().lower()
    f.close()

    # Tokenize
    txt_tokens = regexp_tokenize(txt, pattern_words)
    if params.s:
        txt_tokens = [token for token in txt_tokens if token not in stopwords]

    # Extract ngrams
    unigrams = regexp_tokenize(txt, pattern_unigrams)
    bigrams = regexp_tokenize(txt, pattern_bigrams)

    # Create frequency distributions
    fdist_words = FreqDist(txt_tokens)
    fdist_ngrams = FreqDist(unigrams + bigrams)

    # Store most common words and ngrams for later comparison of texts
    words_most_common.append([k for (k, _) in fdist_words.most_common(params.n)])
    ngrams_most_common.append([k for (k, _) in fdist_ngrams.most_common(params.m)])

    outputname = "output_for_" + f.name.rsplit(os.sep, 2)[1]
def get_tri_grams(corpus):
    words = regexp_tokenize(corpus, "[\\S]+")
    tri_grams = [[f'{words[i]} {words[i + 1]}', words[i + 2]]
                 for i in range(len(words) - 2)]
    return tri_grams
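# Quick illustration of the structure returned above (arbitrary sample sentence):
# each entry pairs a two-word head with the word that follows it, the usual shape
# for a trigram next-word table.
print(get_tri_grams("to be or not to be"))
# [['to be', 'or'], ['be or', 'not'], ['or not', 'to'], ['not to', 'be']]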
def is_portuguese(document):
    stopwords = nltk.corpus.stopwords.words('portuguese')
    for word in regexp_tokenize(document, "[\w#@]+"):
        if word in stopwords:
            return True
    return False
text_file = f.read()
text_file = text_file.decode('utf-8').strip()

# splits sentences
from nltk.tokenize import sent_tokenize
tokens = sent_tokenize(text_file)
# print tokens

# splits words
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text_file)
# print tokens

# whitespace tokenizer
from nltk.tokenize import regexp_tokenize
tokenizer = regexp_tokenize(text_file, '\s+', gaps=True)
# print tokenizer

from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
words = tokenizer
# print [word for word in words if word not in english_stops]

# look up words and print synset
from nltk.corpus import wordnet
syn = wordnet.synsets('cookbook')[0]
print syn.name()
print syn.definition()
print syn.hypernyms()
print syn.hypernyms()[0].hyponyms()
print syn.root_hypernyms()
__author__ = 'Peiman'

import nltk
# nltk.download()
# from nltk.book import *

x = 'welcome to my first sentiment analysis and natural language processing learning codes!.'

from nltk.tokenize import regexp_tokenize
tokenizer = regexp_tokenize(x, r'\w+')

tokenized = nltk.word_tokenize(x)
tagged = nltk.pos_tag(tokenized)

print tokenized
print tagged
from nltk.tokenize import regexp_tokenize

text = input()
print(regexp_tokenize(text, r"[A-Za-z'\-]+"))
def prepare_data(self, data=False, re_train=False):
    flag = True
    if data == False:
        a = open('manner.xml').readlines()
        sent = []
        for k in a:
            k = k.lower()
            st = k.find('<subject>')
            if st == -1:
                continue
            end = k.find('</subject>')
            sent.append(k[st + 9:end - 1])
        data = sent
        flag = False
    # print data[0:5]

    sentence = ["%s %s %s" % (self.START, x, self.END) for x in data]
    tokenize_sent = [
        regexp_tokenize(x, pattern='\w+|$[\d\.]+|\S+') for x in sentence
    ]
    freq = nltk.FreqDist(itertools.chain(*tokenize_sent))
    print 'found ', len(freq), ' unique words'

    if self.vocab_size > len(freq):
        self.vocab_size = len(freq)
    self.vocab = freq.most_common(self.vocab_size - 3)
    index_to_word = [x[0] for x in self.vocab]
    index_to_word.append(self.unk_token)
    index_to_word.append(self.START)
    index_to_word.append(self.END)
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

    for i, sent in enumerate(tokenize_sent):
        tokenize_sent[i] = [
            w if w in word_to_index else self.unk_token for w in sent
        ]

    self.char_indices = word_to_index
    self.indices_char = index_to_word

    if re_train == True or flag == True:
        sentences = []
        next_chars = []
        sentences_f = []
        sentences_b = []
        next_chars_f = []
        next_chars_b = []
        for sent in tokenize_sent:
            temp = [self.START for i in range(self.step)]
            flag = False
            for word in sent:
                temp.remove(temp[0])
                temp.append(word)
                if flag == True:
                    next_chars_f.append(word)
                if word != self.END:
                    temp1 = []
                    for i in temp:
                        temp1.append(i)
                    sentences_f.append(temp1)
                flag = True
        for sent in tokenize_sent:
            temp = [self.END for i in range(self.step)]
            flag = False
            for word in sent[::-1]:
                temp.remove(temp[0])
                temp.append(word)
                if flag == True:
                    next_chars_b.append(word)
                if word != self.START:
                    temp1 = []
                    for i in temp:
                        temp1.append(i)
                    sentences_b.append(temp1)
                flag = True
        print('preparing forward backward windows...')
        sentences, next_chars = [], []
        sentences.extend(sentences_f)
        sentences.extend(sentences_b)
        next_chars.extend(next_chars_f)
        next_chars.extend(next_chars_b)

        X_data = []
        for i in sentences:
            temp = []
            for j in i:
                temp.append(word_to_index[j])
            X_data.append(temp)
        y_data = []
        for i in next_chars:
            y_data.append(self.char_indices[i])
        # y_train = np_utils.to_categorical(y_data, vocab_size)
        y_train = np.zeros((len(sentences), self.vocab_size), dtype=np.bool)
        # X_train = sequence.pad_sequences(X_data, maxlen=maxlen)
        for i in range(len(y_data)):
            y_train[i][y_data[i]] = True
        self.X_data = X_data
        self.y_data = y_train
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer("[\w']+")
print(tokenizer.tokenize("Can't is a contraction."))

# or
from nltk.tokenize import regexp_tokenize
print(regexp_tokenize("Can't is a contraction.", "[\w']+"))

# use gaps
tokenizer = RegexpTokenizer('\s+', gaps=True)
print(tokenizer.tokenize("Can't is a contraction."))