def bag_of_words(data, label_codebook, feature_codebook, theta):
    """Build binary bag-of-words vectors for every document in `data`.

    `data` maps a label to a list of documents; `theta` is an offset into the
    frequency-ordered word list used to select feature words.
    """
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern=r"\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist(w for w in all_words)
    # dict views are not sliceable in Python 3, so materialise the keys first
    word_feature = list(fdict.keys())[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)
    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern=r"\w+"))
            for word in tokens:
                if feature_codebook.has_label(word):
                    index = feature_codebook.get_index(word)
                    vector[index] = 1.
            instance_list[label].append(vector)
    return instance_list
def get_freqs(text):
    stop_words = nltk.corpus.stopwords.words('english')
    frequencies = defaultdict(int)
    pattern = r'''(?x)               # set flag to allow verbose regexps
          ([A-Z]\.)+                 # abbreviations, e.g. U.S.A.
        | \w+(-\w+)*                 # words with optional internal hyphens
        | \$?\d+(\.\d+)?%?           # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                     # ellipsis
        | [][.,;"'?():-_`]           # these are separate tokens
        '''
    if isinstance(text, list):
        print('number of links: ' + str(len(text)))
        for t in text:
            content = t['content']
            tokens = nltk.regexp_tokenize(content, pattern)
            for word in tokens:
                if len(word) > 2 and word.lower() not in stop_words:
                    cap = word[0].upper() + word[1:]
                    frequencies[cap] += 1
    else:
        tokens = nltk.regexp_tokenize(text, pattern)
        for word in tokens:
            if len(word) > 2 and word not in stop_words:
                frequencies[word] += 1
    print("frequency size: " + str(len(frequencies)))
    return frequencies
def load(f):
    """Tokenize currency amounts and dates from the file at path `f`."""
    with open(f) as fh:
        raw = fh.read()
    # (?x) keeps the verbose flag when nltk re-compiles the pattern internally,
    # and the alternation separates the currency and date cases
    pattern = r"""(?x)
          \$?\d+(\.\d+)?%?     # currency and percentages
        | \d+/\d+/\d+          # dates
        """
    return nltk.regexp_tokenize(raw, pattern)
def nltkTest():
    s = "russia licenza 8.1.5 U.S."
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)
    s = "Saldo vs. Fattura n. 2015/004"
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)
def regularExpressionTokenizer():
    text = 'That U.S.A. poster-print costs $12.40...'
    pattern = r'''(?x)               # set flag to allow verbose regexps
          ([A-Z]\.)+                 # abbreviations, e.g. U.S.A.
        | \w+(-\w+)*                 # words with optional internal hyphens
        | \$?\d+(\.\d+)?%?           # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                     # ellipsis
        | [][.,;"'?():-_`]           # these are separate tokens
        '''
    print(nltk.regexp_tokenize(text, pattern))
def get_links(text):
    # checks only for 'http://...' and 'www...'
    text = text + " "
    pat = r"http://.*?\s"
    links = nltk.regexp_tokenize(text, pat)
    text = " " + text + " "
    pat = r"\swww\..*?\..*?\s"
    links.extend(nltk.regexp_tokenize(text, pat))
    # strip the trailing whitespace delimiter captured by each match
    links = [link[:-1] for link in links]
    return links
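# A minimal usage sketch for get_links above, with a hypothetical input string
# (not taken from the original source); it assumes `import nltk` at module level.
sample = "see http://example.com/page and also www.example.org for details"
print(get_links(sample))
# Returns the 'http://...' URL and the 'www...' host; note the www entry keeps the
# leading-space delimiter matched by the pattern, since only the trailing one is sliced off.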
def poss_test(test_file,test_write,sw_file): """ Arguments: - `train_file`: """ a = 0 f = open(test_file) reader = csv.reader(f) t = open(test_write,"w") sw = open(sw_file) sw = sw.readlines() sw = [word.strip() for word in sw] stopwords = sw print "停顿词表长度",len(stopwords) stopwords = set(stopwords) g = lambda x : x not in stopwords for row in reader: if a == 0: a += 1 continue if a%1000 == 0: print a a += 1 #if a == 8: # sys.exit(1) title = row[1].lower() #clean html body = nltk.clean_html(row[2].lower()) #work tokenize pattern = r"([a-z])\w+" body = nltk.regexp_tokenize(body, pattern) title = nltk.regexp_tokenize(title, pattern) #light stem #title = set([stem(word) for word in title]) #body = set(body) #body = set([stem(word) for word in body]) #remove stopwords #body = filter(g,body) #title = filter(g,title) body = ' '.join(body) title = ' '.join(title) t.write('%s , %s \n'%(title,body))
def poss_test(test_file,test_write,sw_file): """ Arguments: - `train_file`: """ a = 0 f = open(test_file) reader = csv.reader(f) t = open(test_write,"w") sw = open(sw_file) sw = sw.readlines() sw = [word.strip() for word in sw] #stopwords = sw stopwords = nltk.corpus.stopwords.words('english') stopwords = set(stopwords) g = lambda x : x not in stopwords for row in reader: if a%10000 == 0: print(a) a += 1 #if a == 8: # sys.exit(1) title = row[1].lower() #clean html body = nltk.clean_html(row[2].lower()) #work tokenize pattern = r"(\.?[a-z][a-z0-9\+\.\#\-]+[a-z0-9\+\#])" body = nltk.regexp_tokenize(body, pattern) title = nltk.regexp_tokenize(title, pattern) #remove stopwords body = filter(g,body) title = filter(g,title) #light stem title = set([stem(word) for word in title]) body = set(body) body = set([stem(word) for word in body]) body = ' '.join(body) title = ' '.join(title) t.write('"%s","%s","%s"\n'%(row[0],title,body))
def query_episode(self, show_title, ep_title, se_number, ep_number, runtime): """build video list prior to scoring """ qres = {} # Query 1 qlist = (show_title, ep_title) # Search YouTube tmp = self.search('%s %s' % qlist) for k, v in tmp.items(): qres[k] = v # Query 2 qlist = (show_title, ep_title, se_number, ep_number) # Search YouTube tmp = self.search('%s %s %s %s' % qlist) for k, v in tmp.items(): qres[k] = v # Query 3 qlist = (show_title, se_number, ep_number) # Search YouTube tmp = self.search('%s s%02de%02d' % qlist) for k, v in tmp.items(): qres[k] = v # Show tokens sh_stem = [self._lancaster.stem(t) \ for t in nltk.regexp_tokenize( show_title.encode('utf8'), r"\w+")] # Episode stem tokens if exist if ep_title: ep_stem = [self._lancaster.stem(t) \ for t in nltk.regexp_tokenize( ep_title.encode('utf8'), r"\w+")] else: ep_stem = None res = {'Output': qres, 'Input': {},} res['Input']['show_title'] = show_title res['Input']['ep_title'] = ep_title res['Input']['sh_stem'] = sh_stem res['Input']['ep_stem'] = ep_stem res['Input']['se_number'] = se_number res['Input']['ep_number'] = ep_number res['Input']['runtime'] = runtime return res
def poss_train(train_file,train_write,sw_file): """ Arguments: - `train_file`: """ a = 0 f = open(train_file) reader = csv.reader(f) t = open(train_write,"w") sw = open(sw_file) sw = sw.readlines() sw = [word.strip() for word in sw] #stopwords = sw # use nltk stopwords stopwords = nltk.corpus.stopwords.words('english') print "停顿词表长度",len(stopwords) stopwords = set(stopwords) g = lambda x : x not in stopwords for row in reader: if a%100000 == 0: print a a += 1 title = row[1].lower() #clean html body = nltk.clean_html(row[2].lower()) #word tokenize pattern = r"([a-z])\w+" body = nltk.regexp_tokenize(body, pattern) title = nltk.regexp_tokenize(title, pattern) #remove stopwords body = filter(g,body) title = filter(g,title) #light stem #st = LancasterStemmer() title = set([stem(word) for word in title]) body = set(body) body = set([stem(word) for word in body]) # list to string body = ' '.join(body) title = ' '.join(title) t.write('"%s","%s","%s","%s"\n'%(row[0], title,body,row[3]))
def normalized(text, lowercase=True, fix=True, tuples=False):
    """Tokenize, remove capitalization and exclude punctuation"""
    if fix:
        text = fix_text(str(text))
    pattern = r"""(?x)       # verbose regexps
        \w+(-\w+)*           # words with optional internal hyphens
        """
    result = nltk.regexp_tokenize(text, pattern)
    if lowercase:
        result = [w.lower() for w in result]
    if tuples:
        result = tuple(result)
    return result
def handleSubject1(outputFile): """ :return: dict """ index = 0 termdict = dict() subjectList = list() f = open("data/topic/subject1_w_date.txt") for item in f: array = item.strip().split("DELIMER") count = array[0] subject = array[3] for (regex, repl) in helper.regexList.items(): subject = regex.sub(repl, subject) for s in helper.specialSet: subject = subject.replace(s, "") termList = nltk.regexp_tokenize(subject, helper.nltkPattern) # use nltk-package to participle the subject s = "" for term in termList: if term.lower() not in helper.excludeSet: s += term + " " # reconstruct the subject if term not in termdict: termdict[term.strip()] = index index += 1 if s != "": regex = re.compile("\s+") s = regex.sub(" ", s) subjectList.append("{}DELIMER{}DELIMER{}DELIMER{}".format(count, array[1], array[2], s.strip())) fileHelper.writeIterableToFile(outputFile, subjectList) return termdict
def tokenprocess(Strtext):
    with open(Strtext) as f:
        raw = f.read().strip()
    stop_words = stopwords.words('english')
    pattern = r'''(?x)([A-Z]\.)+|\w+(-\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_`]'''
    text1 = [word.lower() for word in nltk.regexp_tokenize(raw, pattern)]
    text1_filter = [word for word in text1
                    if len(word) > 1 and word.find("'") == -1 and word not in stop_words]
    return text1_filter

# eassylist = corpus_data()
# print(eassylist[-1])
# stop_words = stopwords.words('english')
# print(stop_words)
def tag(text):
    tokens = nltk.regexp_tokenize(text, SENTENCE_REGEX)
    pos_tokens = nltk.tag.pos_tag(tokens)
    chunker = nltk.RegexpParser(GRAMMAR)
    tree = chunker.parse(pos_tokens)
    terms = Tagger.get_terms(tree)
    return Tagger.word_list(terms)
def russian_get_text(inp, output): # parse tweets from .csv file by Julia Rubtsova # from 'Метод построения и анализа корпуса коротких текстов для задачи классификации отзывов' data = read_data(inp) res = [] pattern = '''"(.*?)";''' for line in data: tokens = nltk.regexp_tokenize(line, pattern) if len(tokens) < 4: continue text = tokens[3][1:-2] mentions = get_mentions(text) links = get_links(text) hashtags = get_hashtags(text) text = process(text) sname = tokens[2][1:-2] if text == '': continue row = [text, 'not-given', sname, 'not-given', ','.join(hashtags), ','.join(mentions), ','.join(links)] row = '\t'.join(row) res.append(row) write_data(output, res)
def ShowCollocations(): text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n") import nltk from nltk.collocations import BigramCollocationFinder from nltk.collocations import TrigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.metrics import TrigramAssocMeasures pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']''' data = resultsbox.get(1.0,END) rawtext=nltk.regexp_tokenize(data, pattern) prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()] text.delete(1.0, END) text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n") text.insert(END, "\nBigram Collocations:\n") bigram = BigramAssocMeasures() bigramfinder = BigramCollocationFinder.from_words(prepcolloc) bigramfinder.apply_freq_filter (3) bigrams=bigramfinder.nbest(bigram.pmi, 10) for item in bigrams: first = item[0] second = item[1] text.insert(END, first) text.insert(END, " ") text.insert(END, second) text.insert(END, "\n")
def main(self, text): """Breaks a single string into a tree using the grammar and returns the specified words as a string.""" if text is None: return None try: text = text.encode("ascii", "ignore") except: text = text.decode("utf-8", "ignore").encode("ascii", "ignore") chunker = nltk.RegexpParser(grammar) toks = nltk.regexp_tokenize(text, sentence_re) postoks = nltk.tag.pos_tag(toks) #print postoks tree = chunker.parse(postoks) terms = self.get_terms(tree) words = self.get_words(terms) return words
def longitud_promedio_palabras_moens(lista): regexp = "[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+" total_palabras_en_oraciones = 0 num_oraciones = 0 tokens = 0 promedio_longitud_palabras_oraciones = [] for oracion in lista: total_palabras_oracion = 0 num_palabras_oracion = 0 tokens = nltk.regexp_tokenize(oracion, regexp) total_palabras_en_oraciones += len(tokens) for palabra in tokens: total_palabras_oracion += len(palabra) num_palabras_oracion += 1 #print palabra #print len(palabra) if total_palabras_oracion > 0: promedio_longitud_palabras_oraciones.append(total_palabras_oracion/num_palabras_oracion) else: print oracion #print len(tokens) #total += len(oracion.split()) num_oraciones += 1 #promedio = total_palabras_en_oraciones / num_oraciones #print promedio_longitud_palabras_oraciones suma_promedios=0 num_promedios = 0 for promedios in promedio_longitud_palabras_oraciones: suma_promedios += promedios num_promedios += 1 promedio = suma_promedios/num_promedios #promedio = sum(promedio_longitud_palabras_oraciones)/float(len(promedio_longitud_palabras_oraciones)) return promedio
def classif(text, mass, num_all_docs, num_words_unic): stm = Stemmer('russian') text = stm.stemWords(regexp_tokenize((text.decode('UTF-8')).lower(), r"(?x) \w+ | \w+(-\w+)*")) num_povt_words = 0 summa = 0 while_iter = 0 while while_iter < len(mass): summand_1 = log((mass[while_iter].num_docs + 0.0) / (num_all_docs + 0.0) + 0.0, 1.1) for i in text: for i1 in mass[while_iter].lst_allword: if i == i1: num_povt_words = num_povt_words + 1 summand_2 = log(((num_povt_words + 1) + 0.0) / ((num_words_unic + mass[while_iter].num_words) + 0.0), 1.1) num_povt_words = 0 summa = summa + summand_2 mass[while_iter].c = summand_1 + summa summa = 0 while_iter = while_iter + 1 max_c = -100000 while_iter = 0 number_max = 0 while while_iter < len(mass): print mass[while_iter].c if mass[while_iter].c > max_c: max_c = mass[while_iter].c number_max = while_iter while_iter = while_iter + 1 print mass[number_max].name_categories
def numero_puntuacion_moens(texto):
    # count punctuation tokens in the text
    regexp = "[/,$?:;!()&%#=+{}*~.]+"
    tokens = nltk.regexp_tokenize(texto, regexp)
    total = len(tokens)
    print(len(tokens))
    print(tokens)
    return total
def word_couple_con_puntuacion_pares_minusculas(lista): word_couples = [] regexp = "[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+-*[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+|[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+|[.]+|[/,$?:;!()&%#=+{}*~.]+|[0-9]+" for oracion in lista: #oracion = str(oracion) #oracion = oracion.to_lower #print oracion tokens = nltk.regexp_tokenize(oracion.lower(), regexp) #print len(tokens) # tokens_lower = [] # for i in range(len(tokens)): # palabra = str(tokens[i]) # tokens_lower.append(palabra.to_lower() ) pairs = list(itertools.permutations(tokens, 2)) for pair in pairs: word_couples.append(pair[0]+"~"+pair[1]) return word_couples
def tokenize_punctuation(t):
    """Tokenizes the punctuation in a text 't'."""
    pattern = r'''(?x)  # set to be verbose
        \W              # searches for non-alphanumeric characters.
        '''
    matches = nltk.regexp_tokenize(t, pattern)
    return matches
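# Minimal usage sketch for tokenize_punctuation, using a hypothetical input string and
# assuming `import nltk` at module level: \W matches one character at a time, so every
# space and punctuation mark comes back as its own single-character token.
print(tokenize_punctuation("Wait -- really?!"))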
def extract(self, text): ''' Extract and freudify noun phrases from text, return all succesfully freudified noun phrases. ''' toks = nltk.regexp_tokenize(text, self.sentence_re) postoks = nltk.tag.pos_tag(toks) tree = self.chunker.parse(postoks) terms = self._get_terms(tree) phrases = sets.Set() # Loop through all the noun phrases and try to freudify them. for term in terms: if (len(term)) < 2: continue changed = False context = "" phrase = [] for part in term: word, tag = part word = word.encode('ascii', 'replace') phrase.append(word.lower()) rpl = self.replace_word(tag[:2], word) if len(rpl[2]) > 0: context = rpl[2] phrase[-1] = rpl[0] changed = True if changed: phrase = " ".join(phrase).strip() phrase.encode('ascii', 'replace') phrase = str(phrase) if phrase not in self.own_phrases[context]: phrases.add((str(phrase), context)) phrases = list(phrases) return phrases
def generate_vocab(papers): """Returns the vocabulary used in the papers given in parameters, after cleaning and stopwords removal. Args: papers (list of tuples): the raw list of papers from which generates the vocabulary (each element is a tuple of 3 strings: id, title and abstract) Returns: list of strings: the list of tokens forming the vocabulary """ sc = StringCleaner() # Generate author's vocabulary corpus = " ".join(p[1] + " " + p[2] for p in papers) # Cleaning corpus = sc.clean_string(corpus) # Tokenization pattern = r"(?:[A-Z]\.)+|\w+(?:-\w+)*|\d+(?:\.\d+)?%?" # we keep tokens that are words (with optional internal hyphens), acronyms and percentages tokens = set(nltk.regexp_tokenize(corpus, pattern)) - set(nltk.corpus.stopwords.words("english")) num_re = re.compile("^\d+$") tokens = set([t for t in tokens if not num_re.match(t)]) # we remove only-numeric tokens # Stemming porter = nltk.stem.PorterStemmer() return [porter.stem(t) for t in tokens]
def AO_lTokenize(AO_sText): ''' This brreakes a text into individual words Adapted From Natural Language Processing with Python ''' regex = r'''(?xi) (?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+ # Bills | ([A-Z]\.)+ # Abbreviations (U.S.A., etc.) | ([A-Z]+\&[A-Z]+) # Internal ampersands (AT&T, etc.) | (Mr\.|Dr\.|Mrs\.|Ms\.) # Mr., Mrs., etc. | \d*\.\d+ # Numbers with decimal points. | \d\d?:\d\d # Times. | \$?[,\.0-9]+\d # Numbers with thousands separators, (incl currency). | (((a|A)|(p|P))\.(m|M)\.) # a.m., p.m., A.M., P.M. | \w+((-|')\w+)* # Words with optional internal hyphens. | \$?\d+(\.\d+)?%? # Currency and percentages. | (?<=\b)\.\.\.(?=\b) # Ellipses surrounded by word borders | [][.,;"'?():-_`] ''' # Strip punctuation from this one; solr doesn't know about any of it tokens = regexp_tokenize(AO_sText, regex) # tokens = [re.sub(r'[.,?!]', '', token) for token in tokens] # instead of this we just test word length return tokens
def handle_doc(word_set,rs_path): doc_dir = os.listdir(rs_path) doc_matrix = [] doc_cat = [] for docs in doc_dir: files = os.listdir(rs_path+docs) print "start to handle the --> "+docs for file_d in files: d_path = rs_path+docs+'/'+file_d #get the single file path with open(d_path,'rb') as text_file: str_tmp = '' file_lines = text_file.readlines() for line in file_lines: pattern = r'''[a-zA-Z]+''' tokens = nltk.regexp_tokenize(line,pattern) for t in tokens: if t.lower() in word_set: str_tmp += t.lower() str_tmp += ' ' doc_matrix.append(str_tmp) doc_cat.append(cat_dic[docs]) text_file.close() str_tmp = '' for sw in word_set: str_tmp += sw str_tmp += ' ' doc_matrix.append(str_tmp) doc_cat.append('NAN') vectorizer = CountVectorizer() doc_num = vectorizer.fit_transform(doc_matrix) tfidf = TfidfTransformer() doc_tfidf = tfidf.fit_transform(doc_num) return doc_tfidf[:-1,:],doc_cat[:-1]
def word_segment(data, mark_stop, english_stop):
    """
    Tokenize the text and remove stopwords.
    :param data: input text
    :param mark_stop: punctuation stopwords
    :param english_stop: English stopwords
    """
    """
    Alternative using nltk.word_tokenize:
    segment_text = nltk.word_tokenize(data.replace('.', ' '))
    segment_text = [word.lower() for word in segment_text if word.lower() not in (english_stop + mark_stop)]
    segment = nltk.pos_tag(segment_text)  # part-of-speech tagging
    """
    pattern = r"""(?x)([A-Z]\.)+|\w+(-\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;'"?():-_`]"""
    segment_text = nltk.regexp_tokenize(data, pattern)
    # optional stemming
    # porter = nltk.PorterStemmer()
    segment_text = [t.lower() for t in segment_text if t.lower() not in (english_stop + mark_stop)]
    segment = nltk.pos_tag(segment_text)  # part-of-speech tagging
    segment_list = []
    for item in segment:
        segment_list.append(item[0] + "," + item[1])
    return segment_list
def tokenize_tag_text(description): """Removes some punctuation, tags each word by part-of-speech, and generates keyword and keyword prhases based on noun phrases patterns using regexp.""" sentence_re = r'''(?x) ([A-Z])(\.[A-Z])+\.? # set flag to allow verbose regexps | \w+(-\w+)* # words with optional internal hyphens | \$?\d+(\.\d+)?%? # currency and percentages | \.\.\. # ellipsis | [][.,;"?():-_`] # separate tokens ''' grammar = r""" NBAR: {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns {<NNP|NNPS>+<IN>?<NNP|NNPS>+} # A sequence of proper nouns connected with zero or more prepositions {<DT|PP\$>?<JJ>*<NN|NNS>} # Determiners (e.g. 'the', 'a') or possessive, followed by one or more adjective {<NN>+} # A sequence of one or more nouns NP: {<NBAR>} {<NBAR><IN><NBAR>} """ chunker = nltk.RegexpParser(grammar) toks = nltk.regexp_tokenize(description, sentence_re) postoks = nltk.tag.pos_tag(toks) tree = chunker.parse(postoks) return tree
def compute_df(self, document_list): '''Compute document frequency based on input document list''' df_cache = dict() df_output = dict() d_index = 0 for document in document_list: d_index += 1 # tokenize each document reg_toks = nltk.regexp_tokenize(document, SENTENCE_RE) for item in reg_toks: # change each word to lower case and lemmatize item = normalise(item) if item not in df_cache: df_cache[item] = set([d_index]) else: df_cache[item].add(d_index) for item in df_cache: if acceptable_word(item): df_output[item] = len(df_cache[item]) df_output['total_document'] = len(document_list) return df_output
def parse(self, response): fd = nltk.FreqDist() punct = set(string.punctuation) self.i += 1 titles = Selector(response=response).xpath('//title/text()').extract() filename = response.url.split("/")[-2] filedir = dirs[self.i - 1] + '/' + filename filedir = 'Top' + '/' + filedir.split('/')[1] + '-' + filedir.split('/')[2] print dirs[self.i - 1] print filedir temp = stripAllTags(response.body) s = MLStripper() s.feed(temp) pure_body = s.get_data() pure_body = pure_body.lower() pure_body = unicodedata.normalize('NFKD', pure_body).encode('ASCII', 'ignore') for word in nltk.regexp_tokenize(pure_body, pattern=r'\.|(\s+)', gaps = True): if word not in punct and word not in common_words: fd.inc(word) freq_tuples = fd.items() if not(os.path.exists(filedir)): os.makedirs(filedir) filedir = filedir + '/' + filename with open(filedir, 'wb') as f: f.write('@attribute ' + filedir.split('/')[1] + ' {0,1}\n\n' + '@data\n') f.write('\n') for title in titles: f.write(title.encode('utf-8').strip()) f.write('\n') for item in freq_tuples: i = 0 while i < item[1]: i+=1 f.write(item[0] + '\n')
def URLDECODE(XSS):
    XSS = XSS.lower()
    XSS = unquote(unquote(XSS))
    XSS, num = re.subn(r'\d+', "0", XSS)
    XSS, num = re.subn(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?]+', "http://u", XSS)
    # (?x) must sit at the very start of the pattern for the inline flag to be accepted
    r = r'''(?x)
         [\w\.]+?\(
        |\)
        |"\w+?"
        |'\w+?'
        |http://\w
        |</\w+>
        |<\w+>
        |<\w+
        |\w+=
        |>
        |[\w\.]+
    '''
    return nltk.regexp_tokenize(XSS, r)
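# A minimal usage sketch for URLDECODE above, with a hypothetical payload string; it assumes
# `re`, `nltk`, and an `unquote` import (e.g. from urllib.parse) are available in the module.
payload = "%3Cscript%3Ealert('xss123')%3C/script%3E"
print(URLDECODE(payload))
# After double URL-decoding and digit generalization, the tokenizer splits the payload into
# generic tokens such as '<script>', 'alert(', "'xss0'", ')' and '</script>'.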
def get_docs(file_name):
    in_put = open(file_name, 'r')
    raw = in_put.readlines()
    # keep the indices of papers that have an abstract
    paper_id = [raw.index(w) for w in raw if w != 'null\n']
    raw = [w.lower() for w in raw if w != 'null\n']
    docs = [nltk.regexp_tokenize(w, pattern) for w in raw]
    # keep only alphabetic words (hyphenated words are dropped) and remove stopwords
    for i in range(len(docs)):
        docs[i] = [w for w in docs[i] if w.isalpha() and w not in stop_word]
    wnl = nltk.WordNetLemmatizer()
    # lemmatization
    for i in range(len(docs)):
        docs[i] = [wnl.lemmatize(t) for t in docs[i]]
    in_put.close()
    return docs
def computeSentiment(text):
    # Tokenize and remove stop words
    tokens = []
    for t in nltk.regexp_tokenize(text.lower(), '[a-z]+'):
        if t not in sr:
            tokens.append(t)
    # Count the number of positive and negative words.
    pos_count = 0
    neg_count = 0
    for t in tokens:
        if t in pos_words:
            pos_count += 1
        elif t in neg_words:
            neg_count += 1
    # Compute sentiment
    if (pos_count + neg_count) > 0:
        sentiment = float(pos_count - neg_count) / float(pos_count + neg_count)
    else:
        sentiment = 0
    return sentiment
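# A minimal usage sketch for computeSentiment above. The stopword set `sr` and the word
# lists `pos_words` / `neg_words` are module-level globals in the original; the tiny stand-in
# values below are hypothetical, only there to make the call runnable in isolation.
sr = {'the', 'is', 'a'}
pos_words = {'great', 'good'}
neg_words = {'bad', 'awful'}
print(computeSentiment("The movie is great but the ending is bad bad"))
# one positive and two negative hits give (1 - 2) / (1 + 2) = -0.33...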
def GeneSeg(payload):
    payload = payload.lower()
    # payload = unquote(unquote(payload))  # already decoded upstream, so no need to decode again
    # generalize digits to "0"
    payload, num = re.subn(r'\d+', "0", payload)
    # replace URLs with "http://u"
    payload, num = re.subn(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?]+', "http://u", payload)
    # tokenize; (?x) must sit at the very start of the pattern
    r = r'''(?x)
         [\w\.]+?\(
        |\)
        |"\w+?"
        |'\w+?'
        |http://\w
        |</\w+>
        |<\w+>
        |<\w+
        |\w+=
        |>
        |[\w\.]+
    '''
    return nltk.regexp_tokenize(payload, r)
def text_parse(cls, x): try: sentence = x.strip().lower() except: sentence = x sentence = re.sub(cls.hndl_regex, cls.hndl_repl, sentence) # 匹配替换@*** sentence = re.sub(cls.hash_regex, cls.hash_repl, sentence) # 匹配替换#*** sentence = re.sub(cls.url_regex, cls.url_repl, sentence) # 匹配替换URL sentence = re.sub(cls.rpt_regex, cls.rpt_repl, sentence) # 匹配替换类似yoooooooo为yoo emoticons_regex = [(repl, re.compile(cls.regex_union(cls.escape_paren(regx)))) for (repl, regx) in cls.emoticons] # 匹配替换表情 for (repl, regx) in emoticons_regex: sentence = re.sub(regx, ' ' + repl + ' ', sentence) pattern = r""" (?x)(?:[a-z]\.)+ | \d+(?:\.\d+)?%?\w+ | \w+(?:[-']\w+)* | (?:[-.!?]{2,}) | [][.,;"'?():$-_*`]""" word_list = nltk.regexp_tokenize(sentence, pattern) return word_list
def talk_to_bot():
    vocab = chatbot.read_vocab()
    vectors = chatbot.read_vectors()

    # Exercise:
    # find the length of the longest token sequence in the question and answer data.
    dialog_questions = vectors[::2]
    dialog_answers = vectors[1::2]

    max_len_q = max([len(q) for q in dialog_questions])
    max_len_a = max([len(a) for a in dialog_answers]) + 1
    print(max_len_q, max_len_a)         # 9 10

    # ------------------------------------------------ #

    # model = tf.keras.
    onehot = np.eye(len(vocab), dtype=np.float32)

    while True:
        sys.stdout.write('왕자: ')       # prompt ("Prince: ")
        sys.stdout.flush()

        line = sys.stdin.readline()
        line = line.strip()

        if '끝' == line:                 # typing "끝" ("end") quits the loop
            break

        # Exercise:
        # split the input sentence into tokens.
        # tokens = line.split()          # unreliable when there are multiple spaces(?)
        tokens = nltk.regexp_tokenize(line, r'\w+')
        # print(tokens)                  # ['이리', '와서', '나하고', '놀자']

        # Exercise:
        # convert the tokens into a question (map string tokens to vocabulary indices).
        question = [vocab.index(t) if t in vocab else chatbot._UNK_ for t in tokens]
def analyze2(text2): # takes a list of comment strings and tokenizes and finds pairs of positive and negative words with specific phone features new=[] tokens=[] count=0 negations=['not', 'too', 'n\'t', 'no', 'cannot', 'neither','nor'] with open("positive-words.txt",'r') as f: positive_words=[line.strip() for line in f] with open("negative-words.txt",'r') as f: negative_words=[line.strip() for line in f] positive_tokens=[] negative_tokens=[] #N is 1 this time, negating word right before pos or neg word reviewpos=[] reviewneg=[] features=["headphones", "battery", "sound", "charge", "screensize", "size", "space", "storage", "camera", "speed", "display", "sensor", "casing", "price"] for text in text2: #text=text.strip(string.punctuation) #text=text.strip(" ") #tokens=nltk.word_tokenize(text) #tokens = re.split(r"\W+", text) pattern=r'\w[\w\'-]*\w' tokens=nltk.regexp_tokenize(text, pattern) tokens=[tokens.lower() for tokens in tokens] #tokens=[token.strip(string.punctuation) for token in tokens] #tokens=[token.strip() for token in tokens if token.strip()!=''] new.append(tokens) #change += to not have seperated list per comment count+=1 for x in new: for i in range(0, len(x)): previ="" if i>0: previ=x[i-1] if previ in positive_words and x[i] in features: reviewpos.append((previ,x[i])) if previ in negative_words and x[i] in features: reviewneg.append((previ,x[i])) return reviewpos, reviewneg
def tknse(s): """ Tokenises a sentence string in a suitable to way to analyse both bible text and twitter data (e.g. catching and filtering out mentions, URLs, ...) """ import string from nltk import regexp_tokenize # define pattern for regexp pattern = [ r'<[^>]+>', # HTML tags (drop) r'(?:@[\w_]+)', # @-mentions (catch and filter drop) #r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags (keep as words) r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs (catch and filter) r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers (drop) r'(?:[A-Z]\.)+', # abbreviations, e.g. U.S.A. #r'\$?\d+(?:\.\d+)?%?', # currency and percentages, e.g. $12.40, 82% (leave as numbers) r'(?:[\w_]+)', # other words r'(?:\S)' # anything else ] pattern = r'(' + '|'.join( pattern) + ')' # collapse into single regex string tok = regexp_tokenize(s, pattern) # tokenise # filter out unwanted tokens tok = list( filter( lambda w: (w[0] not in ['@']) and # @-mentions (w[0:4].lower() != 'http') and # URLs (w.replace('.', '', 1).isdigit() == False) and # numbers (w not in string.punctuation), tok)) # to lower case tok = [w.lower() for w in tok] # lower case only return (tok)
def text_parse(input_text, language='en'): sentence = input_text.strip().lower() sentence = re.sub(r'@\s*[\w]+ | ?#[\w]+ | ?&[\w]+; | ?[^\x00-\xFF]+', '', sentence) special_tag = { '.', ',', '#', '!', '(', ')', '*', '`', ':', '"', '‘', '’', '“', '”', '@', ':', '^', '/', ']', '[', ';', '=', '_' } pattern = r""" (?x)(?:[a-z]\.)+ | \d+(?:\.\d+)?%?\w+ | \w+(?:[-']\w+)*""" word_list = regexp_tokenize(sentence, pattern) filter_word = [] if language == 'en': filter_word = [ w for w in word_list if w not in stopwords.words('english') and w not in special_tag ] # 去停用词和特殊标点符号 word_tag = pos_tag( filter_word, tagset=None, lang=language) # 词性标注,返回标记列表[('Codeine', 'NNP'), ('15mg', 'CD') res_word_list = [] lemmatizer = WordNetLemmatizer() # 词形还原 tag_list = { 'TO', 'RB', 'RBR', 'RBRS', 'UH', 'WDT', 'WP', 'WP$', 'WRB', 'SYM', 'RP', 'PRP', 'PRP$', 'CD', 'POS', ':' } for i in range(0, len(word_tag)): # 去掉副词、介词、小品词、疑问词、代词、人称代词、所有格代名词等 if word_tag[i][1] in tag_list: continue else: word = lemmatizer.lemmatize(word_tag[i][0]) res_word_list.append(word) return res_word_list
def tokenExtractor(file): doc = xml.dom.minidom.parse(file) movieText = "" for item in doc.getElementsByTagName("s"): for child in item.childNodes: if child.nodeName == "#text" and len(re.findall("\w", child.nodeValue)) > 1: movieText += child.nodeValue movieText = re.sub("\n\s+", " ", movieText) movieText = re.sub("\n", "", movieText) pattern = r'''(?x)(?:[A-Z]\.)+ | \w+(?:-\w+)* | ''' tokens = nltk.regexp_tokenize(movieText, pattern) # Remove non alphanumericbetic characters tokens = [w for w in tokens if re.search(r'\w', w)] # Remove stopwords stopwords = nltk.corpus.stopwords.words('english') tokens = [w for w in tokens if w.lower() not in stopwords] stopwords = nltk.corpus.stopwords.words('spanish') tokens = [w for w in tokens if w.lower() not in stopwords] stopwords = nltk.corpus.stopwords.words('french') tokens = [w for w in tokens if w.lower() not in stopwords] stopwords = nltk.corpus.stopwords.words('italian') tokens = [w for w in tokens if w.lower() not in stopwords] # Remove numbers tokens = [w for w in tokens if not re.search(r'\d', w)] # Lower case tokens = [t.lower() for t in tokens] tokens = [item for item in tokens if item.isalpha()] return tokens
def get_pos_tags(text):
    """Used when tokenizing words"""
    text = tostring(text)
    regex_patterns = r"""(?x)          # set flag to allow verbose regexps
          (?:[A-Z]\.)+                 # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*                 # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?           # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                       # ellipsis
        | [][.,;"'?():-_`]             # these are separate tokens
        """
    # POS tagging
    # postoks = nltk.pos_tag(text.split())
    toks = nltk.regexp_tokenize(text, regex_patterns)
    assert isinstance(toks, list), "toks is not a list of str, cannot tokenize."
    postoks = nltk.tag.pos_tag(toks)
    # fix a weird pos-tagging error in NLTK
    prior_pos = ''
    for i in range(0, len(postoks)):
        if prior_pos == 'TO' and 'VB' not in postoks[i][1]:
            old = postoks.pop(i)
            postoks.insert(i, (old[0], 'VB'))
        prior_pos = postoks[i][1]
    # print('getPOStags_returns:', postoks)
    return postoks
def keyphrase_sentence(sentence):
    sentence_re = r'''(?x)
          (?:[A-Z]\.)+
        | \w+(?:-\w+)*
        | \$?\d+(?:\.\d+)?%?
        | \.\.\.
        | [][.,;"'?():_`-]
        '''
    toks = nltk.regexp_tokenize(sentence.lower(), sentence_re)  # sentence tokenisation
    postoks = nltk.tag.pos_tag(toks)
    for i in range(len(postoks)):
        # check whether 'N'/'RB'/'DT' appears in the pos list
        if postoks[i][1][0] == 'N' or postoks[i][1] == 'RB' or postoks[i][1] == 'DT':
            # span of keyphrase (starting point - 'N'/'RB'/'DT',
            # ending point - end of that sentence)
            token_ls = toks[i:len(toks)]
            # remove stopwords from phrases
            token_ls = [t for t in token_ls if t not in stop_word_ls]
            if len(token_ls) >= 3:
                return " ".join(token_ls)
def test_handle(word_list,tr_path): docdir_list = os.listdir(tr_path) test_m = [] test_cat = [] for dd in docdir_list: file_list = os.listdir(tr_path+dd) print "handling the---> "+dd+" <---directory.." for fpath in file_list: d_path = tr_path + dd + '/' + fpath with open(d_path,"rb") as text_file: str_tmp = '' test_cat.append(cat_dic[dd]) fl = text_file.readlines() test_por = nltk.PorterStemmer() for doc_line in fl: pattern = r'''[a-zA-Z]+''' tokens = nltk.regexp_tokenize(doc_line,pattern) for t in tokens: if t.lower() in word_list: str_tmp += t.lower() str_tmp += ' ' test_m.append(str_tmp) text_file.close() #最后增加一维把所有的wordset加进去?!try once! str_tmp = '' for sw in word_list: str_tmp += sw str_tmp += ' ' test_m.append(str_tmp) test_cat.append(10) vectorizer = CountVectorizer() doc_m = vectorizer.fit_transform(test_m) tfidf = TfidfTransformer() test_matrix = tfidf.fit_transform(doc_m) #test_matrix = log_sparsematrix(test_matrix) return test_matrix,test_cat
def get_clean_text_pattern(recomposed_note): """Function that filters through the notes, retrieves those that match the specified pattern and removes stopwords.""" pattern = "([a-zA-Z0-9\\\]+(?:'[a-z]+)?)" recomposed_note_raw = nltk.regexp_tokenize(recomposed_note, pattern) # Create a list of stopwords and remove them from our corpus stopwords_list = stopwords.words('english') stopwords_list += list(string.punctuation) # additional slang and informal versions of the original words had to be added to the corpus. stopwords_list += ([ "im", "ur", "u", "'s", "n", "z", "n't", "brewskies", "mcd’s", "Ty$", "Diploooooo", "thx", "Clothessss", "K2", "B", "Comida", "yo", "jobby", "F", "jus", "bc", "queso", "fil", "Lol", "EZ", "RF", "기프트카드", "감사합니다", "Bts", "youuuu", "X’s", "bday", "WF", "Fooooood", "Yeeeeehaw", "temp", "af", "Chipoodle", "Hhuhhyhy", "Yummmmers", "MGE", "O", "Coook", "wahoooo", "Cuz", "y", "Cutz", "Lax", "LisBnB", "vamanos", "vroom", "Para", "el", "8==", "bitchhh", "¯\\_(ツ)_/¯", "Ily", "CURRYYYYYYY", "Depósito", "Yup", "Shhhhh" ]) recomposed_note_stopped = ([ w.lower() for w in recomposed_note_raw if w not in stopwords_list ]) return recomposed_note_stopped
def processPreDiffCode(code): code = re.sub(r'(\"[\s\S]*?\")', '', code, 0, re.I) code = re.sub(r'(@@[\s\S]*?\n)', '', code, 0, re.I) code = re.sub(r'(\+[\s\S]*?\n)', '', code, 0, re.I) result = [] mis = methodInvocationCase.findall(code) for mi in mis: miWords = mi.split('\.') for miWord in miWords: toDeal = [] if camelCase1.match(miWord) or camelCase2.match(miWord): toDeal = splitCode(miWord) elif upperExtCase.match(miWord): toDeal = splitFinalExt(miWord) elif upperCase.match(miWord): toDeal.append(miWord) for deal in toDeal: if not isDelete(deal.lower()): result.append(stemmer.stem(deal)) code = re.sub(r'([A-Za-z0-9_]+\.[A-Za-z0-9_]+)', '', code, 0, re.I) sentences = tokenizer.tokenize(code) for sentence in sentences: words = nltk.regexp_tokenize(sentence, pattern) for word in words: toDeal = [] if camelCase1.match(word) or camelCase2.match(word): toDeal = splitCode(word) elif upperExtCase.match(word): toDeal = splitFinalExt(word) elif upperCase.match(word): toDeal.append(word) for deal in toDeal: if not isDelete(deal.lower()): result.append(stemmer.stem(deal)) return result
def extract_clean_text(json_file): wv = [] cnt = 0 stoplist = load_stoplist() wordnet_lemmatizer = WordNetLemmatizer() with open(json_file, 'r') as json_file: user_tweets = json.load(json_file) for user in user_tweets: text = '' for tweet in user_tweets[user]: text += common.cleanhtml( common.remove_hashtag_sign( common.remove_username( common.remove_url(ftfy.fix_text(tweet))))) + ' ' # clean_texts = [wordnet_lemmatizer.lemmatize(word.lower()) for word in nltk.regexp_tokenize(text, pattern)] clean_texts = [ wordnet_lemmatizer.lemmatize(word.lower()) for word in nltk.regexp_tokenize(text, pattern) if wordnet_lemmatizer.lemmatize(word.lower()) not in stoplist ] wv.append(clean_texts) cnt += 1 logger.info('total tweets: %d;' % cnt) return wv
def tf_text(text_title_summary_reviews, docID): """ Returns a list of filtered terms: (term, (docID, tf/sqrt(len(keywords)))) """ pattern = r'''(?x) # set flag to allow verbose regexps aujourd'hui # exception 1 | prud'hom\w+ # exception 2 | \w' # contractions d', l', j', t', s' | \d+(?:,\d+)?%?€? # currency and percentages, e.g. 12,40€, 82% | (?:[A-Z]\.)+ # abbreviations, e.g. U.S.A. | \w+(?:-\w+)* # words with optional internal hyphens #| [][.,;"'?():_`-] # these are separate tokens; includes ], [ ''' words = nltk.regexp_tokenize(text_title_summary_reviews.lower(), pattern) keywords = [] fdist = FreqDist() for elt in words: if elt[0] in LOADED_LEMMA: try: # on prend le 1e lemma possible meme si ça peut etre faux (ex: abstrait -> abstraire (verbe)) lemma = [x[0] for x in LOADED_LEMMA[elt[0]] if x[0][0] == elt][0][1] except: with open("backend/language/lemma/missing.txt", "a") as f: f.write(unidecode.unidecode(elt) + "\n") lemma = elt if not lemma in stopwords: keywords.append(lemma) fdist = FreqDist(keywords) result = [(x[0], (docID, (1 + log10(x[1])) / sqrt(len(keywords)))) for x in fdist.items()] return result
def pos(text):
    sentence_re = r'''(?x)
          ([A-Z])(\.[A-Z])+\.?
        | \w+(-\w+)*
        | \$?\d+(\.\d+)?%?
        | \.\.\.
        | [][.,;"'?():-_`]
        '''
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}
    """
    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    # return True as soon as at least two nouns/adjectives are seen
    count = 0
    for word, tag in postoks:
        if tag == "NN" or tag == "JJ":
            count += 1
            if count >= 2:
                print("Inside")
                return True
    # tree = chunker.parse(postoks)
    # terms = get_terms(tree)
    return False
def compute_imprs_word_counts(file_names): texts = extract_texts(file_names) dicts = [extract_dict(t) for t in texts] imprs = [d['IMPRESSION'] for d in dicts] # for split #adapted from https://stackoverflow.com/a/22178786/1469195 # (removed capturing groups) pattern = r'''(?x) # set flag to allow verbose regexps (?:[A-Z]\.)+ # abbreviations, e.g. U.S.A. | \$?\d+(?:\.\d+)?%? # numbers, incl. currency and percentages | \w+(?:[-']\w+)* # words w/ optional internal hyphens/apostrophe | [+/\-@&*] # special characters with meanings ''' words = regexp_tokenize("\n".join(imprs), pattern) words = clean_words(words) counter = Counter(words) result = namedtuple('WordResult', ['counter', 'imprs', 'words'], verbose=False)( counter=counter, imprs=imprs, words=words, ) return result
def __init__(self, dbfile, colText, colCnt, min_support=.01): timer = Timer() self.min_support = min_support dbSize = 0 vocab = {} itemset = [] texts = [] ## load data, tokenize the text, hash vocabulary f = open(dbfile, 'rU') rdr = csv.reader(f, delimiter='\t') fdist = nltk.probability.FreqDist() for r in rdr: text = unicode(r[colText], 'utf-8') tokens = nltk.regexp_tokenize(text, tokenPattern) if colCnt < 0: num = 1 else: num = int(r[colCnt]) text = [] for t in tokens: if not t in stopwords: if not t in vocab: vocab[t] = len(itemset) itemset.append(t) text.append(vocab[t]) if len(text) > 0: texts.append((text, num)) dbSize += num self.dbSize = dbSize self.vocab = vocab self.itemset = itemset self.texts = texts f.close() timer.printElapsed()
def limpiar_texto(texto): '''En esta función el texto se va a tokenizar pero a partir de una expresión regular ''' spanishstemmer=SnowballStemmer('spanish') pattern = r'''(?x) #set flag to allow verbose regexps (?:[A-Z]\.)+ #abbreviations, e.g. U.S.A. | \w+(?:-\w+)* #words with optional internal hyphens(guiones internos) | \$?\d+(?:\.\d+)?%? #currency(dinero) and percentages, e.g. $12.40, 82% ''' #Definiendo stop words(palabras de interrupción) stop_words = set(stopwords.words('spanish')) #convertir a minusculas texto = texto.lower() #aplicando tokenización por medio de la expresión regular texto_tokenizado = nltk.regexp_tokenize(texto,pattern) #quitando palabras de intrerrupción (cerradas) words = [w for w in texto_tokenizado if not w in stop_words] #convirtiendo palabras en raices stems = [spanishstemmer.stem(token) for token in words] return stems
def pos_tag_sent(self, sent, boi_form=True): """ Tag sentence convert sentence from BOI-form to String-form and tag it using the provided tagger :param sent: boi sentence :param pos_tagger: tagger to tag the sentence :return: """ if boi_form: untagged_sent = self.convert_from_boi_to_sent(sent) else: untagged_sent = sent tokens = nltk.regexp_tokenize(untagged_sent, pattern=" ", gaps=True) pos_tagged_sent = self.tagger.tag(tokens) result = [] for i, (_, word) in enumerate(pos_tagged_sent): r = word.split('/') if len(r) == 3: r = ['/', r[-1]] result.append((tuple(r), sent[i][1])) return result
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# Taken from Su Nam Kim Paper...
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)

toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)
print(postoks)
tree = chunker.parse(postoks)

from nltk.corpus import stopwords
stopwords = stopwords.words('english')


def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    # label is a method in NLTK 3, so it must be called
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        yield subtree.leaves()
tokens = nltk.word_tokenize(raw)
[porter.stem(t) for t in tokens]

# Lemmatization - WordNet lemmatizer
wnl = nltk.WordNetLemmatizer()
# if you want to compile the vocabulary of some texts and want a list of valid lemmas
[wnl.lemmatize(t) for t in tokens]

# 3.7 Regular Expressions for Tokenizing Text
# simple approach - split on whitespace
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone though), 'I won't have any pepper in my kitchen AT ALL. Soup does very well without--Maybe it's always pepper that makes people hot-tempered,'..."""
re.split(r' ', raw)
re.split(r'[ \t\n]+', raw)   # matches one or more spaces, tabs or newlines
re.split(r'\W+', raw)        # split input on anything other than a word character (\w = [a-zA-Z0-9_])

# nltk.regexp_tokenize() is more efficient for this task
text = 'That U.S.A. poster-print costs $12.40..'
pattern = r'''(?x)           # set flag to allow verbose regexps
      ([A-Z]\.)+             # abbreviations, e.g. U.S.A.
    | \w+(-\w+)*             # words with optional internal hyphens
    | \$?\d+(\.\d+)?%?       # currency and percentages
    | \.\.\.                 # ellipsis
    | [][.,;"'?():-_`]       # these are separate tokens
    '''
nltk.regexp_tokenize(text, pattern)   # didn't work: the capturing groups confuse the tokenizer

# Segmentation - sentence segmentation and word segmentation
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = sent_tokenizer.tokenize(text)
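# A hedged follow-up to the "didn't work" note above: rewriting the book's pattern with
# non-capturing (?:...) groups is the usual fix, because capturing groups make the
# underlying re.findall return group contents instead of whole matches.
text = 'That U.S.A. poster-print costs $12.40..'
pattern = r'''(?x)             # verbose regexp
      (?:[A-Z]\.)+             # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*             # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?       # currency and percentages
    | \.\.\.                   # ellipsis
    | [][.,;"'?():-_`]         # these are separate tokens
    '''
print(nltk.regexp_tokenize(text, pattern))
# expected (roughly): ['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '.', '.']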
def split_str(line):
    words = nltk.regexp_tokenize(line, tokens_pattern)
    # print(words)
    return words
import re

text = "aoweijriow aofejofenr wajioejfo er (!#^ &% etc@ 3 $ $%#) wearjwoaieraw awoerj oawier"   # don't shadow the built-in str
p = re.compile(r'\(.*\)')
print(p.findall(text))

print("22th".isalnum())

from nltk import regexp_tokenize

txt = "Today it's 07.May 2011. Or 2.999."
print(regexp_tokenize(txt, pattern=r'\w+([.,]\w+)*|\S+'))
trainable = True if trainable: words = [] # this will contain the root words labels = [] # this will contain the tags docs_patterns = [] # this will contain each pattern list docs_labels = [ ] # this will contain each tag, but many times to get the amount of the tags for intent in data['intents']: for pattern in intent['patterns']: # Equals the list pattern to words_pattern without punctuation marks # ['Hi'] # ['How', 'are', 'you'] # ... words_pattern = nltk.regexp_tokenize(pattern, "(\d+|\w+)") # Extend the list words with the list 'words_pattern' # words = ['Hi', 'How', 'are', 'you', 'Is', 'anyone', 'there', ...] words.extend(words_pattern) # Append the list 'words_pattern' to the list doc_x (not extending) # doc_patterns = [['Hi'], ['How', 'are', 'you'], ['Is', 'anyone', 'there']], ... ] docs_patterns.append(words_pattern) # Adding the tags to the list doc_y # doc_labels = ['greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'goodbye', 'goodbye', ...] docs_labels.append(intent["tag"]) # Append each label one time in list labels # labels = ['greeting', 'goodbye', 'thanks', ...]
def tokenize(self, formula):
    clean_formula = re.sub(self._REGEX_CLEAN, '', formula)
    clean_formula = re.sub(self._REGEX_LETTERS, '', clean_formula)
    tokens = regexp_tokenize(clean_formula, self._REGEX_TOKEN)
    return tokens
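# A minimal sketch of how the tokenize method above might be driven. The three regex class
# attributes are not shown in this snippet, so the values below are hypothetical stand-ins
# chosen for a simple arithmetic-formula use case; only the call pattern mirrors the original.
import re
from nltk import regexp_tokenize

class FormulaTokenizer:
    _REGEX_CLEAN = r'\s+'                        # assumed: strip whitespace
    _REGEX_LETTERS = r'[A-Za-z]+'                # assumed: drop letters / variable names
    _REGEX_TOKEN = r'\d+(?:\.\d+)?|[-+*/^()=]'   # assumed: numbers and operators

    def tokenize(self, formula):
        clean_formula = re.sub(self._REGEX_CLEAN, '', formula)
        clean_formula = re.sub(self._REGEX_LETTERS, '', clean_formula)
        return regexp_tokenize(clean_formula, self._REGEX_TOKEN)

print(FormulaTokenizer().tokenize("y = 2 * x + 3.5"))   # ['=', '2', '*', '+', '3.5']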
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer   # stemming and lemmatizing
from nltk import regexp_tokenize                          # tokenize
import pandas as pd
import csv

wr = open("stemmed_words.txt", "w")                       # write to stemmed word text file
with open('words_alpha.csv', 'r') as csvfile:             # read from csv file
    data = csv.reader(csvfile, delimiter=' ')
    print(data)
    stemmer1 = PorterStemmer()                            # Porter stemmer
    lemma = WordNetLemmatizer()                           # lemmatizer
    for row1 in data:
        row1 = row1[0].replace('\n', '')                  # drop any newline character
        x = stemmer1.stem(row1)
        y = lemma.lemmatize(row1)
        row1 = regexp_tokenize(row1, r"[\w']+")           # tokenizing
        z = nltk.pos_tag(row1)                            # POS tagging
        print(z[0][0], z[0][1], x, y)
        wr.write(z[0][0] + " " + z[0][1] + " " + x + " " + y + '\n')   # writing
wr.close()                                                # the with-block already closes csvfile