import re
from nltk_lite import tokenize  # assumed import; the code below uses tokenize.regexp


def _parse(s):
    rx_pattern = re.compile(
        r"""
        \(CODE .*\)
        |\(ID .*\d\)
        """,
        re.VERBOSE | re.UNICODE,
    )
    s = rx_pattern.sub("", s)
    lines = s.split("\n")

    fullPhrase = ""
    # Walk the lines, emitting the buffered sentence each time a new
    # sentence marker (a line starting with "(") is found.
    for sent in lines:
        if list(tokenize.regexp(sent, r"^\(")) != []:
            fullPhrase = _strip_spaces(fullPhrase)
            if fullPhrase != "":
                yield fullPhrase
            fullPhrase = sent
        else:
            fullPhrase += sent

    # Flush whatever is left in the buffer and yield it.
    fullPhrase = _strip_spaces(fullPhrase)
    if fullPhrase != "":
        yield fullPhrase
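# A minimal driver sketch for _parse. The _strip_spaces helper it calls is
# not shown in this section, so a stand-in is defined here; the sample input
# is invented and only illustrates the (CODE ...) removal and the
# sentence-marker buffering.
def _strip_spaces(s):
    # Stand-in: collapse runs of whitespace and trim the ends.
    return " ".join(s.split())

# >>> raw = "(NP the cat) (CODE x1)\n(VP sat on the mat)\n(NP a dog)\n"
# >>> list(_parse(raw))
# ['(NP the cat)', '(VP sat on the mat)', '(NP a dog)']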
def extractWords(text):
    '''
    Extract all the words from a given text.

    @param text: input text
    @return: the list of all found words
    '''
    words = list(tokenize.regexp(text.lower(), pattern))
    return words
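# extractWords reads a module-level `pattern` that is not shown in this
# section; the definition below is only an illustrative guess, alongside the
# nltk_lite-style import the call assumes.
import re
from nltk_lite import tokenize  # assumed import

pattern = re.compile(r"[a-z]+")  # hypothetical: runs of letters only

# >>> extractWords("Dogs bark; cats meow.")
# ['dogs', 'bark', 'cats', 'meow']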
def addCounts(self, line, subject=False):
    # Earlier variants, kept for reference:
    #   stem(x.lower())
    #   words = [x for x in regexp(line, WORDPUNCT)]
    words = [x for x in regexp(line, pattern=r'\w+', gaps=False)]
    for word in words:
        # Unseen words get the next free vocabulary index.
        vci = self.vocab.setdefault(word, len(self.vocab))
        self.counts[vci] += 1
        if subject:
            self.subjCounts[vci] += 1
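# addCounts assumes a host object with vocab/counts/subjCounts attributes and
# an in-scope `regexp` tokenizer. The class below is a hypothetical sketch of
# that state; only the attribute names are taken from the method body.
from collections import defaultdict
from nltk_lite.tokenize import regexp  # assumed import

class CountModel(object):
    def __init__(self):
        self.vocab = {}                     # word -> vocabulary index
        self.counts = defaultdict(int)      # index -> total frequency
        self.subjCounts = defaultdict(int)  # index -> frequency in subject lines

    addCounts = addCounts  # reuse the method defined above

# >>> model = CountModel()
# >>> model.addCounts("Re: meeting agenda", subject=True)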
def tokenizeAndStem(self, string):
    """Yield a stream of downcased words from a string."""
    # crude HTML comment and tag removal
    string = self.strip_comments.sub('', string)
    string = self.strip_tags.sub('', string)
    tokenstream = tokenize.regexp(string, self.whitespace_splitter)
    for token in tokenstream:
        token = token.lower()
        # Skip stop words and words with fewer than three letters;
        # pass three-letter words through unchanged, stem anything longer.
        if len(token) > 2 and token not in self.stopwords:
            if len(token) == 3:
                yield token
            else:
                yield self.cachingStemmer(token)
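# tokenizeAndStem leans on several instance attributes (strip_comments,
# strip_tags, whitespace_splitter, stopwords, cachingStemmer). The class
# below is a hypothetical wiring of that state: the two regexes, the
# stop-word set, and the stemmer class and its stem() API are assumptions;
# only the attribute names come from the method above.
import re
from nltk_lite import tokenize  # assumed import

class TextPreprocessor(object):
    def __init__(self, stopwords=()):
        self.strip_comments = re.compile(r'<!--.*?-->', re.DOTALL)
        self.strip_tags = re.compile(r'<[^>]+>')
        self.whitespace_splitter = r'\S+'  # tokens are maximal non-space runs
        self.stopwords = set(stopwords)
        self._stemmer = tokenize.PorterStemmer()  # assumed class/location and stem() API
        self._stem_cache = {}

    def cachingStemmer(self, token):
        # Memoize stems so repeated tokens are stemmed only once.
        if token not in self._stem_cache:
            self._stem_cache[token] = self._stemmer.stem(token)
        return self._stem_cache[token]

    tokenizeAndStem = tokenizeAndStem  # reuse the generator defined above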
def txt2tokensOld(text):
    # The text is lowercased first, so the A-Z range below is redundant
    # but harmless.
    pattern = re.compile(r'''[a-zA-Z]+''', re.VERBOSE)
    words = list(tokenize.regexp(text.lower(), pattern))
    return words
def parse(self, p_string):
    """
    Parses a string and stores the resulting hierarchy of
    "domains", "hierarchies" and "tables".

    For the sake of NLP I've parsed the string using the nltk_lite
    context free grammar library.

    A query is a "sentence" and can be a domain, a hierarchy or a table.
    A domain is simply a word.
    A hierarchy is expressed as "domain/domain".
    A table is expressed as "table(sentence, sentence, sentence)".

    Internally the query is represented as an nltk_lite.parse.tree.

    Process:
    1. the string is tokenized
    2. a context free grammar is developed
    3. the tokens are parsed
    4. the parse is converted to a tree representation
    """
    self.nltktree = None

    # Store the query string
    self.string = p_string

    """
    1. Tokenize
    ------------------------------------------------------------------------
    """

    # Tokenize the query string, allowing only strings, parentheses,
    # forward slashes and commas.
    re_all = r'table[(]|\,|[)]|[/]|\w+'
    data_tokens = tokenize.regexp(self.string, re_all)

    """
    2. Develop a context free grammar
    ------------------------------------------------------------------------
    """

    # O = sentence, T = table, H = hierarchy, D = domain
    O, T, H, D = cfg.nonterminals('O, T, H, D')

    # Specify the grammar
    productions = (
        # A sentence can be either a table, a hierarchy or a domain
        cfg.Production(O, [D]),
        cfg.Production(O, [H]),
        cfg.Production(O, [T]),
        # A table must be the sequence:
        # "table(", sentence, comma, sentence, comma, sentence, ")"
        cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),
        # A hierarchy is either domain, forward slash, domain ...
        cfg.Production(H, [D, '/', D]),
        # ... or domain, forward slash, another sentence
        cfg.Production(H, [D, '/', O]),
    )

    # Add domains to the cfg productions.
    # A domain is a token made up entirely of word characters.
    re_domain = re.compile(r'^\w+$')
    # Add a production for every token that matches the expression above.
    for tok in data_tokens:
        if re_domain.match(tok):
            productions = productions + (cfg.Production(D, [tok]),)

    # Make a grammar out of our productions
    grammar = cfg.Grammar(O, productions)
    rd_parser = parse.RecursiveDescent(grammar)

    # Re-tokenize: tokenize.regexp returns a generator, which the
    # domain-production loop above has already exhausted.
    tokens = tokenize.regexp(self.string, re_all)
    toklist = list(tokens)

    """
    3. Parse using the context free grammar
    ------------------------------------------------------------------------
    """

    # Store only the first parse, as the grammar should be
    # completely unambiguous.
    try:
        self.parseList = rd_parser.get_parse_list(toklist)[0]
    except IndexError:
        print("Could not parse query.")
        return

    """
    4. Refine and convert to a Tree representation
    ------------------------------------------------------------------------
    """

    # Render the parse as a bracketed string, strip the artifacts of the
    # str() rendering, and rebuild it as an nltk_lite.parse.tree tree.
    string = str(self.parseList)
    string2 = (string.replace(":", "")
                     .replace("')'", "")
                     .replace("table(", "")
                     .replace("','", "")
                     .replace("'", "")
                     .replace("/", ""))
    self.nltktree = parse.tree.bracket_parse(string2)

    # Store the resulting nltk_lite.parse.tree tree
    self.parseTree = QuerySentence(self.nltktree)
    self.xml = self.parseTree.toXML()
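# A self-contained sketch of steps 1-3 above for one concrete query: build
# the same grammar and parse the token list. The nltk_lite imports are
# assumptions matching the calls in parse(); step 4 (QuerySentence/toXML)
# is omitted because that class is not shown in this section.
import re
from nltk_lite import tokenize
from nltk_lite import parse
from nltk_lite.parse import cfg

query = 'table(country/region, year, population)'
re_all = r'table[(]|\,|[)]|[/]|\w+'
toklist = list(tokenize.regexp(query, re_all))

O, T, H, D = cfg.nonterminals('O, T, H, D')
productions = [
    cfg.Production(O, [D]),
    cfg.Production(O, [H]),
    cfg.Production(O, [T]),
    cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),
    cfg.Production(H, [D, '/', D]),
    cfg.Production(H, [D, '/', O]),
]
# One domain production per word-character token, as in parse() above.
for tok in toklist:
    if re.match(r'^\w+$', tok):
        productions.append(cfg.Production(D, [tok]))

grammar = cfg.Grammar(O, productions)
trees = parse.RecursiveDescent(grammar).get_parse_list(toklist)
print(trees[0])  # the single parse the unambiguous grammar yields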