def _parse(s):
    """Split a corpus string into phrases and yield each non-empty one.

    Lines matching the ``(CODE ...)`` / ``(ID ...)`` markers are stripped
    first; a new phrase starts at every line that opens with ``(``.

    @param s: raw corpus text
    @return: generator of stripped, non-empty phrase strings
    """
    rx_pattern = re.compile(
        r"""
        \(CODE .*\)
        |\(ID .*\d\)
    """,
        re.VERBOSE | re.UNICODE,
    )
    s = rx_pattern.sub("", s)
    # str.split replaces the legacy string-module call form ``split(s, "\n")``
    # (undefined in Python 3); the behaviour is identical.
    lines = s.split("\n")
    fullPhrase = ""
    # Accumulate lines until the next sentence marker (a line starting
    # with "("), then flush the buffered phrase.
    for sent in lines:
        if list(tokenize.regexp(sent, r"^\(")) != []:
            fullPhrase = _strip_spaces(fullPhrase)
            if fullPhrase != "":
                yield fullPhrase
            fullPhrase = sent
        else:
            fullPhrase += sent

    # Flush whatever remains in the buffer.
    fullPhrase = _strip_spaces(fullPhrase)
    if fullPhrase != "":
        yield fullPhrase
# --- Example #2 ---
def extractWords(text):
    '''
        Extract all the words from a given text.

        @param text: input text
        @return: the list of all found words
    '''
    # Tokenize the lowercased text with the module-level ``pattern``.
    lowered = text.lower()
    return list(tokenize.regexp(lowered, pattern))
# --- Example #3 ---
    def addCounts(self, line, subject=False):
        """Count word occurrences in *line*, assigning vocab ids as needed.

        @param line: text to tokenize into ``\\w+`` word tokens
        @param subject: when True, also bump the subject-specific counts
        """
        for token in regexp(line, pattern=r'\w+', gaps=False):
            # New words get the next free vocabulary index.
            idx = self.vocab.setdefault(token, len(self.vocab))
            self.counts[idx] += 1
            if subject:
                self.subjCounts[idx] += 1
# --- Example #4 ---
 def tokenizeAndStem(self, string):
     """Yield a stream of downcased words from a string.

     HTML comments and tags are crudely stripped first.  Tokens of one or
     two characters and stopwords are skipped; three-character tokens are
     yielded as-is; longer tokens are stemmed via the caching stemmer.
     """
     # crude HTML comment and tag removal
     string = self.strip_comments.sub('', string)
     string = self.strip_tags.sub('', string)
     # NOTE(review): the original constructed an unused PorterStemmer here;
     # stemming actually goes through self.cachingStemmer below, so the
     # dead local has been removed.
     tokenstream = tokenize.regexp(string, self.whitespace_splitter)
     for token in tokenstream:
         token = token.lower()
         # ignore words with less than three letters,
         # stem words with more than three letters
         if len(token) > 2 and token not in self.stopwords:
             if len(token) == 3:
                 yield token
             else:
                 yield self.cachingStemmer(token)
# --- Example #5 ---
	def tokenizeAndStem(self, string):
		"""Yield a stream of downcased words from a string.

		HTML comments and tags are crudely stripped first.  Tokens of one
		or two characters and stopwords are skipped; three-character tokens
		are yielded as-is; longer tokens are stemmed via the caching
		stemmer.
		"""
		# crude HTML comment and tag removal
		string = self.strip_comments.sub('', string)
		string = self.strip_tags.sub('', string)
		# NOTE(review): the original constructed an unused PorterStemmer
		# here; stemming actually goes through self.cachingStemmer below,
		# so the dead local has been removed.
		tokenstream = tokenize.regexp(string, self.whitespace_splitter)
		for token in tokenstream:
			token = token.lower()
			# ignore words with less than three letters,
			# stem words with more than three letters
			if len(token) > 2 and token not in self.stopwords:
				if len(token) == 3:
					yield token
				else:
					yield self.cachingStemmer(token)
# --- Example #6 ---
def txt2tokensOld(text):
	"""Tokenize *text* into a list of lowercase alphabetic word tokens."""
	pattern = re.compile(r'''[a-zA-Z]+''', re.VERBOSE)
	lowered = text.lower()
	return list(tokenize.regexp(lowered, pattern))
    def parse(self, p_string):
        """
        Parses a string and stores the resulting hierarchy of "domains"
        "hierarchies" and "tables"

        For the sake of NLP I've parsed the string using the nltk_lite
        context free grammar library.

        A query is a "sentence" and can either be a domain, hierarchy or a table.
        A domain is simply a word.
        A hierarchy is expressed as "domain/domain"
        A table is expressed as "table(sentence, sentence, sentence)"

        Internally the query is represented as a nltk_lite.parse.tree

        Process:
          1. string is tokenized
          2. develop a context free grammar
          3. parse
          4. convert to a tree representation

        @param p_string: the query string to parse; on success populates
            self.nltktree, self.parseTree and self.xml, on failure prints
            an error and leaves self.nltktree as None.
        """
        self.nltktree = None

        # Store the query string
        self.string = p_string

        """
        1. Tokenize
        ------------------------------------------------------------------------
        """

        # Tokenize the query string, allowing only strings, parentheses,
        # forward slashes and commas.
        re_all = r'table[(]|\,|[)]|[/]|\w+'
        # NOTE(review): tokenize.regexp appears to return a one-shot
        # iterator, not a list -- consuming it below (grammar build) is
        # why the string must be re-tokenized before parsing.
        data_tokens = tokenize.regexp(self.string, re_all)

        """
        2. Develop a context free grammar
        ------------------------------------------------------------------------
        """

        # Develop a context free grammar
        # S = sentence, T = table, H = hierarchy, D = domain
        O, T, H, D = cfg.nonterminals('O, T, H, D')

        # Specify the grammar
        productions = (
            # A sentence can be either a table, hierarchy or domain
            cfg.Production(O, [D]), cfg.Production(O, [H]), cfg.Production(O, [T]),

            # A table must be the following sequence:
            # "table(", sentence, comma, sentence, comma, sentence, ")"
            cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),

            # A hierarchy must be the following sequence:
            # domain, forward slash, domain
            cfg.Production(H, [D, '/', D]),
            # domain, forward slash, another operator
            cfg.Production(H, [D, '/', O])
        )

        # Add domains to the cfg productions
        # A domain is a token that is entirely word chars
        # (`compile` is presumably re.compile imported at module level --
        # TODO confirm against the file's imports.)
        re_domain = compile(r'^\w+$')
        # Try every token and add if it matches the above regular expression
        # (this loop exhausts the data_tokens iterator).
        for tok in data_tokens:
            if re_domain.match(tok):
                # Trailing comma makes `prod` a one-element tuple so it can
                # be concatenated onto the productions tuple.
                prod = cfg.Production(D,[tok]),
                productions = productions + prod

        # Make a grammar out of our productions
        grammar = cfg.Grammar(O, productions)
        rd_parser = parse.RecursiveDescent(grammar)

        # Tokens need to be redefined.
        # The first token stream was exhausted by the grammar-building loop
        # above (tokenize.regexp seemingly yields a one-shot iterator), so
        # tokenize the same string again for the parser.
        tokens = tokenize.regexp(self.string, re_all)
        toklist = list(tokens)

        """
        3. Parse using the context free grammar
        ------------------------------------------------------------------------
        """
        # Store the parsing.
        # Only the first one, as the grammar should be completely nonambiguous.
        try:
            self.parseList = rd_parser.get_parse_list(toklist)[0]
        except IndexError:
            # No parse found: report and bail out (Python 2 print statement).
            print "Could not parse query."
            return


        """
        4. Refine and convert to a Tree representation
        ------------------------------------------------------------------------
        """
        # Set the nltk_lite.parse.tree tree for this query to the global sentence
        string = str(self.parseList)
        # Scrub the stringified parse of punctuation/markers so that what
        # remains is plain bracketed-tree syntax for bracket_parse.
        string2 = string.replace(":","").replace("')'","").replace("table(","").replace("','","").replace("'","").replace("/","")
        self.nltktree = parse.tree.bracket_parse(string2)

        # Store the resulting nltk_lite.parse.tree tree
        self.parseTree = QuerySentence(self.nltktree)
        self.xml = self.parseTree.toXML()