Example #1
    def Chunk(self, sentence, node='NP', grammar=r"""
                  NP: {<DT|PP\$>?<JJ>*<NN>}
                      {<NNP>+}
                      """):
        '''
        Takes tagged sentences and returns a list of nouns and noun phrases.
        This is done by regular-expression matching, which is included in the
        NLTK library.

        @param sentence: the tagged sentences that are going to be chunked
        @param node='NP': the node label to extract from the chunked tree
        @param grammar='NP: {<DT|PP\$>?<JJ>*<NN>}{<NNP>+}': the chunking grammar (RegEx) to use

        @return: A list of chunked phrase strings.
        '''

        tmp = []

        cp = RegexpParser(grammar)
             
        for sent in sentence:
            for phrase in self.sub_leaves(cp.parse(sent), node):
                tmp.append(phrase)

        results = []
        for phrase in tmp:
            string = ""
            for (word, tag) in phrase:
                string = string + word + " "
            
            results.append(string[:-1])
            
        return results
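For reference, here is a minimal standalone sketch of what this grammar does when passed straight to RegexpParser; the sample sentence and the direct use of tree.subtrees() in place of the class's sub_leaves helper are assumptions for illustration only.

from nltk import pos_tag, word_tokenize
from nltk.chunk import RegexpParser

# Hypothetical sample input; Chunk() itself expects already POS-tagged sentences.
tagged = pos_tag(word_tokenize("The quick brown fox saw John"))

cp = RegexpParser(r"""
    NP: {<DT|PP\$>?<JJ>*<NN>}
        {<NNP>+}
    """)
tree = cp.parse(tagged)

# Collect the words under each NP subtree, mirroring what Chunk() returns.
phrases = [" ".join(word for word, tag in st.leaves())
           for st in tree.subtrees(lambda t: t.label() == "NP")]
print(phrases)  # e.g. ['The quick brown fox', 'John'], depending on the tagger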
Example #2
 def ProcessWords(self, arr):
     tagged = pos_tag(arr)
     chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>}"""
     chunkParser = RegexpParser(chunkGram)
     chunked = chunkParser.parse(tagged)
     print(chunked)
     return chunked
Example #3
def get_chunks(tagged_sentences):
    master_list = []
    master_noun = []
    master_adj = []
    grammar = r"""
    CHUNK1:
        {<NN.*><.*>{0,3}<JJ.*>}  # a noun followed within three tokens by an adjective

    CHUNK2:
        {<JJ.*><.*>{0,3}<NN.*>}  # an adjective followed within three tokens by a noun
    """
    cp = RegexpParser(grammar)
    for sent in tagged_sentences:
        tree = cp.parse(sent)
        for subtree in tree.subtrees(
                filter=lambda t: t.label() in ['CHUNK1', 'CHUNK2']):
            if (str(subtree).find('NN') > 0 or str(subtree).find('NNS') > 0
                    or str(subtree).find('NNP') > 0) and (
                        str(subtree).find('JJ') > 0
                        or str(subtree).find('JJS') > 0
                        or str(subtree).find('JJR') > 0):
                nouns = [
                    word for word, tag in subtree.leaves()
                    if tag in ['NN', 'NNS', 'NNP']
                ]
                adjss = [
                    word for word, tag in subtree.leaves()
                    if tag in ['JJ', 'JJR', 'JJS']
                ]
                master_noun.extend([nouns])
                master_adj.extend([adjss])
    return [m[0] + ":" + n[0] for m, n in zip(master_noun, master_adj)]
Example #4
    def Chunk(self,
              sentence,
              node='NP',
              grammar=r"""
                  NP: {<DT|PP\$>?<JJ>*<NN>}
                      {<NNP>+}
                      """):
        '''
        Takes tagged sentences and returns a list of nouns and noun phrases.
        This is done by regular-expression matching, which is included in the
        NLTK library.

        @param sentence: the tagged sentences that are going to be chunked
        @param node='NP': the node label to extract from the chunked tree
        @param grammar='NP: {<DT|PP\$>?<JJ>*<NN>}{<NNP>+}': the chunking grammar (RegEx) to use

        @return: A list of chunked phrase strings.
        '''

        tmp = []

        cp = RegexpParser(grammar)

        for sent in sentence:
            for phrase in self.sub_leaves(cp.parse(sent), node):
                tmp.append(phrase)

        results = []
        for phrase in tmp:
            string = ""
            for (word, tag) in phrase:
                string = string + word + " "

            results.append(string[:-1])

        return results
Example #5
def parse_request(message):
    tagPatterns = [
        (r'(honda|toyota|ford|kia|hyundai|audi|bmw|opel|mitsubishi|mazda|skoda|subaru)$',
         'VENDOR'),
        (r'([a-zA-Z0-9]+)$', 'MODEL'),
        (r'(от|для)$', 'PREP'),
        (r'(нах|бля|твою мать)$', 'PROFANITY'),
        (r'([а-яА-Я]+)$', 'PART_NAME'),
    ]

    tagger = nltk.RegexpTagger(tagPatterns)
    taggedRequest = tagger.tag(nltk.word_tokenize(message))

    chunker = RegexpParser(r'''
	    S: {<CAR> <PREP>? <PART_NAME>}
	    MODEL: {<MODEL>+}
	    VENDOR: {<VENDOR>}
	    CAR: {<VENDOR> <MODEL>}
	    PROFANITY: {<PROFANITY>+}
	    PART_NAME: {<PART_NAME>+}
	''')

    tree = chunker.parse(taggedRequest)

    car = list(tree.subtrees(lambda t: t.label() == 'VENDOR'))
    parsed_request = {}

    # Hack with try except
    try:
        parsed_request['vendor'] = list(
            tree.subtrees(lambda t: t.label() == 'VENDOR'))[0].leaves()[0][0]
    except Exception:
        parsed_request['vendor'] = None
    try:
        parsed_request['model'] = ' '.join([
            leave[0] for leave in list(
                tree.subtrees(lambda t: t.label() == 'MODEL'))[0].leaves()
        ])
    except Exception:
        parsed_request['model'] = None
    try:
        parsed_request['part_name'] = ' '.join([
            leave[0] for leave in list(
                tree.subtrees(lambda t: t.label() == 'PART_NAME'))[0].leaves()
        ])
    except Exception:
        parsed_request['part_name'] = None
    try:
        if len(list(tree.subtrees(lambda t: t.label() == 'PROFANITY'))):
            parsed_request['profanity'] = True
        else:
            parsed_request['profanity'] = False
    except Exception:
        parsed_request['profanity'] = False

    return parsed_request
Example #6
 def __init__(self,
              patterns: str,
              loop: int = 1,
              trace: int = 0,
              attribute: str = 'pos',
              apply_iob2: bool = True) -> None:
     self.__attribute = attribute
     self.__regex_parser = RegexpParser(patterns,
                                        root_label='',
                                        loop=loop,
                                        trace=trace)
     self.__apply_iob2 = apply_iob2
Example #7
def generate_chunks(tagged_sent, expression=r'CHUNK: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'):
    chunks = []
    chunkParser = RegexpParser(expression)
    try:
        if len(tagged_sent) == 0:
            tree = Tree('S', [])
        else:
            tree = chunkParser.parse(tagged_sent, trace=0)
        for subtree in tree.subtrees():
            if subtree.label() == "CHUNK":
                chunks.append(subtree.leaves())
    except ValueError:
        chunks = []
    return chunks
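A hedged usage sketch: the default expression expects a lowercase tagset (n, adj, prp), so the toy tagged sentence below is invented purely for illustration.

# Hypothetical input tagged with a lowercase tagset (n = noun, adj = adjective, prp = preposition).
tagged_sent = [('bom', 'adj'), ('processamento', 'n'), ('de', 'prp'), ('linguagem', 'n')]

for chunk in generate_chunks(tagged_sent):
    print(chunk)
# Expected: a single CHUNK whose leaves cover all four (word, tag) pairs.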
Example #8
class RegexpChunker(Chunker):

	"""
		This n-gram tagger (chunker) uses grammars to detect phrases.
		setupData: the grammar string
	"""

	def __init__(self, setupData):
		super(RegexpChunker, self).__init__(setupData)
		self.chunker = RegexpParser(setupData)

	def tag(self, data):

		if self.fixer_function:
			data = self.fixer_function(data)
		iobs = None
		try:
			parsedTree = self.chunker.parse(data)
			print(parsedTree)
			iobs = tree2conlltags(parsedTree)
		except Exception:
			pass
		return iobs
Example #9
    def _chunker(self, tuple_sent):
        """Chunk base-phrases using chunking rules.

        Args:
            tuple_sent (list(tuple(str, str)))

        Returns:
            chunk_struct (Tree('S', [Tree('CHUNK', [(str, str), (str, str)]), (str, str), ...])): the chunked sentence
        """
        chunkTreeList = []

        chunker = RegexpParser(self._ChunkingRule(self._CHUNK_RULE_VXP_))

        chunk_struct = chunker.parse(tuple_sent)

        return chunk_struct
Example #10
class RegexpChunker(Chunker):

	"""
		This n-gram tagger (chunker) uses grammars to detect phrases.
		setupData: the grammar string
	"""

	def __init__(self, setupData):
		super(RegexpChunker, self).__init__(setupData)
		self.chunker = RegexpParser(setupData)

	def tag(self, data):

		if self.fixer_function:
			data = self.fixer_function(data)
		iobs = None
		try:
			parsedTree = self.chunker.parse(data)
			iobs = tree2conlltags(parsedTree)
		except Exception:
			pass
		return iobs
Example #11
def find_keywords(text):
  """
    Extracts keywords from text.

    Args:
      text: A text fragment.

    Returns:
      A list containing the extracted keywords.
  """
  grammar = r'''
    KEYWORD: {<NNP><NNP>+}
        {<NN.*><NN.*>+}
        {<JJ>+<NN>+}
  '''
  parser = RegexpParser(grammar)
  sentences = [ ]
  words = [ ]
  keywords = [ ]
  for sentence in sent_tokenize(text):
    tokens = word_tokenize(sentence)
    if not tokens: continue
    sentences.append(tokens)
    words += tokens

  collocations = find_collocations(words)

  for sentence in sentences:
    tree = parser.parse(pos_tag(sentence))
    for node in _select_nodes(tree, ['KEYWORD']):
      word = ' '.join(map(lambda p: p[0], node))
      if word in collocations:
        keywords.append(word)

  keywords = sorted(keywords, key=lambda k: len(k.split()), reverse=True) 
  instances = { }
  for k in keywords:
    key = k
    for existing in instances.keys():
      if re.match(k, existing):
        key = existing
        break
    instances[key] = instances.get(key, 0) + 1 
  results = sorted(instances.items(), key=lambda item: int(item[1]), reverse=True)
  return [item[0] for item in results]
Example #12
def rule_based_reqs_chunk(tagged_reqs, ids):
    chunker = RegexpParser(ruleset)
    terms = []
    term_index = []
    for i, t in enumerate(tagged_reqs):
        s = chunker.parse(t)
        for c in s:
            if not isinstance(c, tuple):
                if c.label() == 'NP':
                    term = []
                    for tagged_word in c:
                        if (tagged_word[1] != 'DT') and (tagged_word[1] !=
                                                         'PRP$'):
                            term = term + [tagged_word[0]]
                    terms.append(term)
                    term_index.append(i)
    return terms, term_index
Example #13
	def parse(self):
		"""
			Parse le texte tokenisé à l'aide de notre grammaire créé pour récupérer les groupes de mots 
			contenant une NE.
		"""
		if self.own_tag:
			rp = RegexpParser(Parser.GRAMMAR_OWN_TAG)
		else:
			rp = RegexpParser(Parser.GRAMMAR)
		tree = rp.parse(self.tokens)

		for subtree in tree.subtrees():
			if subtree.label() == "S":
				continue
			self.tagged_nodes.append(
				[subtree.label(), subtree.leaves()]
			)
		print(self.tagged_nodes)
Example #14
def additionalExtractions(dep_triples, tagged_sentence, svo_triples):
    if not svo_triples:
        return None
    grammar = "SmallNP: {(<CD.*>|<JJ.*>)<NN.*>+}"
    cp = RegexpParser(grammar)
    chunk = cp.parse(tagged_sentence)
    triple_array = []
    for subtree in chunk.subtrees():
        if subtree.label() == 'SmallNP':
            for triple in svo_triples:
                pos = subtree.leaves()
                loc1 = tag_index(pos, triple[0])
                if loc1 != -1:
                    triple_array.extend(chunk_triples(pos, loc1))
                loc2 = tag_index(pos, triple[2])
                if loc2 != -1:
                    triple_array.extend(chunk_triples(pos, loc2))
    return triple_array
Example #15
def extractPossibleTerms(root, fileids):
    # get corpus
    #root, filename = os.path.split(path)
    reader = PlaintextCorpusReader(root, fileids)
    # get chunker
    grammar = 'NP: {<JJ>*<NNP>*<NN>*}'
    chunker = RegexpParser(grammar)
    # get terms
    terms = set()
    print(len(reader.sents()))
    i = 0
    for sent in reader.sents():
        i += 1
        if i % 100 == 0:
            print(i)
        tree = chunker.parse(pos_tag(sent))
        for t in tree.subtrees(lambda t: t.label() != 'S'):  # exclude the Sentence node
            terms.add(' '.join([el[0] for el in t]))
    return terms
Example #16
	def preprocessing(self,desc):
		desc = desc.replace(","," ")
		desc = desc.replace("!","")
		desc = desc.replace("@","")
		desc = desc.replace("#","")
		desc = desc.replace("%","")
		desc = desc.replace("(","")
		desc = desc.replace(")","")
		desc = desc.replace(":","")
		desc = desc.replace("{","")
		desc = desc.replace("}","")
		desc = desc.replace("`","")
		desc = desc.replace("[","")
		desc = desc.replace("]","")
		desc = desc.replace("'","")
		desc = desc.replace("*","")
		desc = desc.replace("&","")
		desc = desc.replace("^","")
		print(desc)
		if "I/O" in desc:
			desc = desc.replace("I/O","IO")
		desc = desc.replace("/"," and ")
		tokenized = nltk.word_tokenize(desc)
		posTag = nltk.pos_tag(tokenized)
		grammar = '''
		RB: {<RB> | <RBS> | <RBR>}'''
		chunker = RegexpParser(grammar)
		chunked = chunker.parse(posTag)
		print(chunked)

		for n in range(len(chunked)):
			if str(chunked[n]).startswith('(RB'):
				if n == 0:
					s = str(chunked[n]).split(" ")
					ss = s[1].split("/")
					removalWord = ss[0]
					desc = desc.replace(removalWord + " ", "")
				if 0 < n < len(chunked):
					s = str(chunked[n]).split(" ")
					ss = s[1].split("/")
					removalWord = ss[0]
					desc = desc.replace(" " + removalWord, "")
		return desc
Example #17
    def extract_ngrams(self, tagged_sent):
        '''
        Extract ngrams, given a list of chunk rules, from the previously tagged sentence.

        Keyword arguments:
        @param tagged_sent the POS-tagged sentence whose ngrams need to be extracted
        '''

        chunker = RegexpParser(CHUNK_RULE)
        tree = chunker.parse(tagged_sent)
        ngrams = []
        for item in self.__leaves(tree):
            if not item == tagged_sent:
                probable_ngram = ' '.join(self.__stemmer.stem(
                    word.lower()) for (word, pos) in item
                )
                if self.__evaluate_polarity_ngram(probable_ngram):
                    ngrams.append(probable_ngram)
        return ngrams
Example #18
def chunking_noun(document):
    #Get the words in the document
    words = word_tokenize(document)
    tagged = nltk.pos_tag(words)
    counts = Counter(tag for WORD, tag in tagged)
    counts = dict(counts)
    #print(counts)
    chunkGram = r""" PHRASE: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"""
    chunkParser = RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged)
    search_keywords = []
    for tree in chunked.subtrees():
        if tree.label() == 'PHRASE':
            search_keyword = ' '.join([x for x, y in tree.leaves()])
            search_keywords.append(search_keyword)
    search_keywords = [
        w for w in search_keywords
        if len(w.split(' ')) > 1 and len(w.split(' ')) <= 3
    ]
    return search_keywords, tagged, counts
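A short usage sketch of chunking_noun on a one-sentence document; the sample text is an assumption, and the exact phrases depend on the POS tagger.

keywords, tagged, counts = chunking_noun("Natural language processing enables fast keyword extraction.")
print(keywords)  # e.g. ['Natural language processing', 'keyword extraction'], depending on the tagger
print(counts)    # tag frequencies, e.g. {'JJ': 1, 'NN': 3, ...}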
Example #19
def get_noun_phrases(text_list, tagger):
    noun_phrases = []
    tagged_texts = [tagger.tag(text.split()) for text in text_list]

    expression = r'NOUN_PHRASE: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'

    chunkParser = RegexpParser(expression)

    for tagged_sent in tagged_texts:
        try:
            if len(tagged_sent) == 0:
                tree = Tree('S', [])
            else:
                tree = chunkParser.parse(tagged_sent, trace=0)
            for subtree in tree.subtrees():
                if subtree.label() == "NOUN_PHRASE":
                    noun_phrases.append([el[0] for el in subtree.leaves()])
        except ValueError:
            noun_phrases = []
    return noun_phrases
Example #20
def extract_candidate_phrases(document_obj, parts_of_speech_re=DEFAULT_RE):
    '''
    :param document_obj: document from which you want to extract parts of the speech (candidate phrases)
    :param parts_of_speech_re: regular expression with parts of speech structure
    :return: dict, keys are the sentence id and values list of candidate phrases for that sentence
    '''
    candidate_phrases = {}

    # get sentences of the document
    sentences = document_obj.get_sentences()

    # for each sentence
    for sentence in sentences:
        sentence_id = sentence.get_sentence_id()

        # get tokens
        tokens_objs = sentence.get_tokens()

        # list of tuples with token and its pos
        token_pos_list = [(token_obj.get_token_str(),
                           token_obj.get_token_pos())
                          for token_obj in tokens_objs]

        # create regex parser with regular expression of tags
        regex_parser = RegexpParser(parts_of_speech_re)
        sentence_regex_tree = regex_parser.parse(token_pos_list)

        # get all subtrees with NP label
        match_subtrees = sentence_regex_tree.subtrees(
            filter=lambda t: t.label() == STAGE_MARKER)
        sentence_candidate_phrases = []

        # add candidate phrases
        for subtree in match_subtrees:
            leaves_str = ' '.join(
                [leave_token_pos[0] for leave_token_pos in subtree.leaves()])
            sentence_candidate_phrases.append(leaves_str)

        candidate_phrases[sentence_id] = sentence_candidate_phrases

    return candidate_phrases
Example #21
def get_search_tags(a, verbose=False):
    if verbose:
        print()
        print('-' * 100)
        print("\tRunning `get_search_tags`...")
        print('-' * 100)

    search_tag_parser = RegexpParser("STAG: {\
        (<RB>|<RBR>|<RBS>|<VB>|<VB[A-Z]>|<IN>|<CC>)\
        (<JJ>|<JJR>|<JJS>|<DT>)\
        (<NN>|<NNS>|<NNP>|<NNPS>)+\
        }")

    pos_tags = pos_tag(word_tokenize(a))
    if verbose:
        print("Part of Speech Tags:", pos_tags, '\n')

    data = search_tag_parser.parse(pos_tags)
    if verbose:
        print("Matched Search Tags:", data)

    return extract_tags(data)
Example #22
def extract_candidate_keywords(document):

    #Get the words in the document
    words = word_tokenize(document)

    # Chunk first to get 'Candidate Keywords'
    tagged = nltk.pos_tag(words)
    chunkGram = r""" PHRASE: 
                        {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}
                """

    chunkParser = RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged)

    candidate_keywords = []
    for tree in chunked.subtrees():
        if tree.label() == 'PHRASE':
            candidate_keyword = ' '.join([x for x,y in tree.leaves()])
            candidate_keywords.append(candidate_keyword)

    candidate_keywords = [w for w in candidate_keywords if len(w) > 3 and  len(w.split(' ')) < 6]
    #print("Data XYZ:",candidate_keywords) 
    return candidate_keywords
Example #23
class NLTKChunker(PackProcessor):
    r"""A wrapper of NLTK chunker.
    """
    def __init__(self):
        super().__init__()
        self.chunker = None

    # pylint: disable=unused-argument
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        self.chunker = RegexpParser(configs.pattern)

    @classmethod
    def default_configs(cls):
        r"""This defines a basic config structure for NLTKChunker.
        """
        config = super().default_configs()
        config.update({
            'pattern': 'NP: {<DT>?<JJ>*<NN>}',
            'token_component': None,
            'sentence_component': None
        })
        return config

    def _process(self, input_pack: DataPack):
        for sentence in input_pack.get(
                Sentence, components=self.configs.sentence_component):
            token_entries = list(
                input_pack.get(entry_type=Token,
                               range_annotation=sentence,
                               components=self.configs.token_component))

            tokens = [(token.text, token.pos) for token in token_entries]
            cs = self.chunker.parse(tokens)

            index = 0
            for chunk in cs:
                if hasattr(chunk, 'label'):
                    # For example:
                    # chunk: Tree('NP', [('This', 'DT'), ('tool', 'NN')])
                    begin_pos = token_entries[index].span.begin
                    end_pos = token_entries[index + len(chunk) - 1].span.end
                    phrase = Phrase(input_pack, begin_pos, end_pos)
                    phrase.phrase_type = chunk.label()

                    index += len(chunk)
                else:
                    # For example:
                    # chunk: ('is', 'VBZ')
                    index += 1
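The index arithmetic above recovers each chunk's character span from the first and last token it covers. A standalone sketch of the same idea with plain (text, pos, begin, end) tuples instead of Forte's Token entries (the tuples are an assumption for illustration):

from nltk.chunk import RegexpParser

# Hypothetical pre-tokenised sentence with character offsets: (text, pos, begin, end).
tokens = [("This", "DT", 0, 4), ("tool", "NN", 5, 9), ("works", "VBZ", 10, 15)]

chunker = RegexpParser('NP: {<DT>?<JJ>*<NN>}')
cs = chunker.parse([(text, pos) for text, pos, _, _ in tokens])

index = 0
for chunk in cs:
    if hasattr(chunk, 'label'):
        # A chunk covers len(chunk) consecutive tokens starting at `index`.
        begin_pos = tokens[index][2]
        end_pos = tokens[index + len(chunk) - 1][3]
        print(chunk.label(), begin_pos, end_pos)  # NP 0 9
        index += len(chunk)
    else:
        index += 1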
Example #24
class NLTKChunker(PackProcessor):
    r"""A wrapper of NLTK chunker.
    """
    def __init__(self):
        super().__init__()
        self.chunker = None
        self.token_component = None

    # pylint: disable=unused-argument
    def initialize(self, resource: Resources, configs: HParams):
        self.chunker = RegexpParser(configs.pattern)

    @staticmethod
    def default_configs():
        r"""This defines a basic config structure for NLTKChunker.
        """
        return {
            'pattern': 'NP: {<DT>?<JJ>*<NN>}',
        }

    def _process(self, input_pack: DataPack):
        for sentence in input_pack.get(Sentence):
            token_entries = list(
                input_pack.get(entry_type=Token,
                               range_annotation=sentence,
                               component=self.token_component))
            tokens = [(token.text, token.pos) for token in token_entries]
            cs = self.chunker.parse(tokens)

            index = 0
            for chunk in cs:
                if hasattr(chunk, 'label'):
                    # For example:
                    # chunk: Tree('NP', [('This', 'DT'), ('tool', 'NN')])
                    begin_pos = token_entries[index].span.begin
                    end_pos = token_entries[index + len(chunk) - 1].span.end
                    phrase = Phrase(input_pack, begin_pos, end_pos)
                    kwargs_i = {"phrase_type": chunk.label()}
                    phrase.set_fields(**kwargs_i)
                    input_pack.add_or_get_entry(phrase)
                    index += len(chunk)
                else:
                    # For example:
                    # chunk: ('is', 'VBZ')
                    index += 1
Example #25
class TreeChunker(ContextChunker):
    def __init__(self,
                 patterns: str,
                 loop: int = 1,
                 trace: int = 0,
                 attribute: str = 'pos',
                 apply_iob2: bool = True) -> None:
        self.__attribute = attribute
        self.__regex_parser = RegexpParser(patterns,
                                           root_label='',
                                           loop=loop,
                                           trace=trace)
        self.__apply_iob2 = apply_iob2

    def tag(self, context: Context) -> List[str]:
        tokens_to_chunk = [
            'NULL' if tk == '' else tk for tk in context.get(self.__attribute)
        ]

        chunk_struct = list(zip(context.get('tokens'), tokens_to_chunk))

        return self._traverse_tree(self.__regex_parser.parse(chunk_struct))

    def _traverse_tree(self, tree, is_subtree: bool = False):
        tags = []
        for i, subtree in enumerate(tree):
            if isinstance(subtree, nltk.tree.Tree):
                tags.extend(self._traverse_tree(subtree, True))
            else:
                tag = tree.label()
                if is_subtree:
                    index = ''
                    if self.__apply_iob2:
                        index = 'B-' if i == 0 else 'I-'

                    tag = f'{index}{tag}'

                tags.append(tag)

        return tags
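A hedged usage sketch for TreeChunker; the dict-backed stand-in for the Context object below is an assumption, since the real Context class is not shown in this example.

class FakeContext:
    """Minimal stand-in for the Context object that TreeChunker.tag() expects."""

    def __init__(self, data):
        self._data = data

    def get(self, key):
        return self._data[key]


context = FakeContext({
    'tokens': ['the', 'lazy', 'dog'],
    'pos': ['DT', 'JJ', 'NN'],
})

chunker = TreeChunker('NP: {<DT>?<JJ>*<NN>}')
print(chunker.tag(context))
# With apply_iob2=True this should yield something like ['B-NP', 'I-NP', 'I-NP'].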
Example #26
class PostPatternStrategy(Strategy):
    """
		Hay casos en que las frases que deseamos detectar se basan en las palabras mas que en su clase gramatical. Tambien podemos 
		ser mas precisos si podemos considerar distintos niveles del arbol, por ejemplo frases y palabras juntas dentro de una regla como un unico token.
		Esta estrategia permite mirar el arbol en altura y ancho, de modo que las gramaticas que escribamos podran ser mas presicas y flexibles.


	"""
    def __init__(self, grammar="", loop=2):
        super(PostPatternStrategy, self).__init__()
        self.postChunker = RegexpParser(grammar, loop=loop)
        self.grammar = grammar
        self.loop = loop

    def fix(self, feature):

        cleanSentence = feature
        tree = None
        try:

            grammar_pattern_to_clean = r'_.*'  # level-separator character within a single token.
            clean_pattern = ''
            modified_chunk_pattern = r'.*_'
            words, post, iobs = zip(*feature)
            wiobs = tuple(
                w + "_" + iob for w, iob in zip(words, iobs)
            )  # the sentences to parse now use the words and IOB tags instead of the POS tags.
            sentence = list(zip(words, wiobs))
            tree = self.postChunker.parse(sentence)
            loc_tags = tree2conlltags(flatten_deeptree(
                tree))  # go from the tree back to a list of tuples.
            cleanSentence = cleanIobs(words, post, loc_tags,
                                      grammar_pattern_to_clean,
                                      modified_chunk_pattern, clean_pattern)

        except Exception:
            pass

        return cleanSentence
Example #27
class PostPatternStrategy(Strategy):

	"""
		Hay casos en que las frases que deseamos detectar se basan en las palabras mas que en su clase gramatical. Tambien podemos 
		ser mas precisos si podemos considerar distintos niveles del arbol, por ejemplo frases y palabras juntas dentro de una regla como un unico token.
		Esta estrategia permite mirar el arbol en altura y ancho, de modo que las gramaticas que escribamos podran ser mas presicas y flexibles.


	"""
	
	def __init__(self, grammar="", loop=2):
		super(PostPatternStrategy, self).__init__()
		self.postChunker = RegexpParser(grammar, loop=loop)
		self.grammar = grammar
		self.loop = loop

	def fix(self, feature):

		cleanSentence = feature
		tree = None
		try:

			grammar_pattern_to_clean = r'_.*'  # level-separator character within a single token.
			clean_pattern = ''
			modified_chunk_pattern = r'.*_'
			words, post, iobs = zip(*feature)
			wiobs = tuple(w + "_" + iob for w, iob in zip(words, iobs))  # the sentences to parse now use the words and IOB tags instead of the POS tags.
			sentence = list(zip(words, wiobs))
			tree = self.postChunker.parse(sentence)
			loc_tags = tree2conlltags(flatten_deeptree(tree))  # go from the tree back to a list of tuples.
			cleanSentence = cleanIobs(words, post, loc_tags, grammar_pattern_to_clean, modified_chunk_pattern, clean_pattern)

		except Exception:
			pass

		return cleanSentence
Example #28
train_data = data[:4000]
test_data = data[4000:]
print(train_data[7])

simple_sentence = 'the quick fox jumped over the lazy dog'

from nltk.chunk import RegexpParser
from pattern.en import tag

tagged_simple_sent = tag(simple_sentence)
print(tagged_simple_sent)

chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""
rc = RegexpParser(chunk_grammar)
c = rc.parse(tagged_simple_sent)
print(c)

chink_grammar = """
NP: {<.*>+} # chunk everything as NP
}<VBD|IN>+{
"""
rc = RegexpParser(chink_grammar)
c = rc.parse(tagged_simple_sent)
print(c)

tagged_sentence = tag(sentence)
print(tagged_sentence)

grammar = """
Example #30
def GetPatternsTree(tagsList, pattern, patternName):
	gramaticalAnalyse = RegexpParser(pattern)
	tree = gramaticalAnalyse.parse(tagsList)
	patt = ExtractPhrases(tree, patternName)
	return patt
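ExtractPhrases is defined elsewhere in that project and is not shown here; a hypothetical minimal version, assuming it simply collects the leaves of every subtree whose label matches patternName, might look like:

def ExtractPhrases(tree, patternName):
    """Hypothetical helper: gather the (word, tag) leaves of each subtree labelled patternName."""
    phrases = []
    for subtree in tree.subtrees(lambda t: t.label() == patternName):
        phrases.append(subtree.leaves())
    return phrases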
Example #31
from tagged_article import TaggedArticle

def sanitizeTags(taggedList):
    sanitizedList = []
    for key, value in taggedList:
        if not value:
            value = 'NNP'
        sanitizedList.append((key, value))
    return sanitizedList

# Open the redis interface
redisInterface = RedisInterface()

# Prepare the chunker
chunker = RegexpParser(r'''
        Nouns:
            {<JJ.*>*<NN.*>*}
        ''')

# Print status
print('Analyzer ONLINE')

# Run the wait-execute loop
articleNumber = 0
while True:

    while not redisInterface.hasArticleData(articleNumber, 'article_data'):
        sleep(1)

    # Retrieve the tagged data from redis
    taggedArticleObject = redisInterface.getArticleData(articleNumber, 'article_data')
Example #32
	def __init__(self,setupData):
		super(RegexpChunker,self).__init__(setupData)
		self.chunker=RegexpParser(setupData)
Example #33
from nltk import Tree, RegexpChunkParser
from nltk.chunk import RegexpParser
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule

s = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')]
# forth
chunker = RegexpParser(r'''
NP:
    {<DT><NN.*><.*>*<NN.*>}
    }<VB.*>{'''
)

print(chunker.parse(s))

# back
t = Tree('S', s)
cs = ChunkString(t)
print(cs)

ur = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')
ur.apply(cs)
print(cs)

ir = ChinkRule('<VB.*>', 'chink verbs')
ir.apply(cs)
print(cs)

print(cs.to_chunkstruct())
# cs.to_chunkstruct().draw()

chunker = RegexpChunkParser([ur, ir])
Example #34
	def chunk(self, posTaggedQuote):
		'''Holds the chunkers used by the condensed class'''
		quoteItemCondensedList = [] 								   #Need to zero this our for testing, might take away later
		EMPChunker = RegexpParser(r"""
		EMP:                                                           #Emotion Phrase
			{<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><,><CC>}                  #Modal, verb, adjective, comma, conjunction
			{<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><CC>}                     #Modal, verb, adjective, conjunction
			{(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><,><CC>}                      #Verb, adjective, comma, conjunction
			{(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><CC>}                         #Verb, adjective, conjunction

			{(<VBP>|<VB>|<VBZ>|<VBD>)<RB><JJ>}                         #Verb, adverb, adjective
			{<MD><RB>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                     #Modal, adverb, verb, adjective
			{<RB>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                         #Adverb, verb, adjective

			{<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                         #Modal, verb, adjective
			{(<VBP>|<VB>|<VBZ>|<VBD>)<TO>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>} #Verb, "to", verb, adjective
			{(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                             #Verb, adjective

		""")
		PRPHChunker = RegexpParser(r"""
		PRPH:                                                          #Pronoun Phrase
			{<.*>*<PRP><.*>*<EMP>}				                       #Anything, pronoun, anything, emotion phrase
			{<EMP><.*>*<PRP><.*>*}				                       #Emotion phrase, anything, pronoun, anything
			}<EMP>{													   #Chink at the EMP chunk, recursion!
		""")

		#This is going to have to be recursive, to chunk the entire phrase
		#This section chunks and condenses; the EMP chunk becomes "EMP"
		#Then sets the happy level of the condensed quoteItem
		EMPChunked = EMPChunker.parse(posTaggedQuote)
		for piece in EMPChunked:
			if type(piece) != tuple:
				#self.quoteItemCondensedList.append((piece, 'EMP'))			#TESTING
				self.quoteItemCondensedList.append(('','EMP'))			#TESTING
			else:
				self.quoteItemCondensedList.append(piece)
		self.printCondensed()


		#Simulating the recursion, PRP chunk next
		#Want to chunk everything separately, then figure out the best recursive algorithm
		newQuoteItemCondensedList = self.quoteItemCondensedList
		self.quoteItemCondensedList = []									#Clear the list to condense more
		PRPHChunked = PRPHChunker.parse(newQuoteItemCondensedList)
		for piece in PRPHChunked:
			if type(piece) != tuple:
				#self.quoteItemCondensedList.append((piece, 'PRPH'))		#TESTING
				self.quoteItemCondensedList.append(('','PRPH'))				#TESTING
			else:
				self.quoteItemCondensedList.append(piece)
		self.printCondensed()

		newQuoteItemCondensedList = self.quoteItemCondensedList
		self.quoteItemCondensedList = []									#Clear the list to condense more
		PRPHChunked = PRPHChunker.parse(newQuoteItemCondensedList)
		for piece in PRPHChunked:
			if type(piece) != tuple:
				#self.quoteItemCondensedList.append((piece, 'PRPH'))		#TESTING
				self.quoteItemCondensedList.append(('','PRPH'))				#TESTING
			else:
				self.quoteItemCondensedList.append(piece)
		self.printCondensed()
Example #35
    def __init__(self):
        grammar = r'''
            R-DATE: {<IN><CD><TO><CD>}
            R-DATE: {<IN><CD><IN><CD>}
            R-DATE: {<JJ><CD><CC><CD>}
            FULL-DATE: {<IN><CD><NNP><CD>}
            FULL-DATE: <VB.*>{<CD><NNP><CD>}
            MONTH-DATE: {(<IN|DT>)?<NNP><CD>}
            NP: {<JJR><IN><CD><NNS>}
            NP: {<IN><CD><NNS>}
            NP: {<CD><IN><DT><CD><NNS>(<JJ>)?}
            DM_DATE: {<IN><CD><NNP>}(<,>|<NN.*>)
            DATE: {<IN>(<DT>)?<CD>}
            DT-DATE: {<DT><CD>}
            POS-DATE: <POS>{<CD>}
            V-DATE: {<IN|CD><JJ><CD>}
            DATE: (<,>)?{<CD>}<,>
            N-DATE: (<,>)?{((<.*DATE><,>)+)?<CD><CC><CD>}

            NN-LST: {<NN.*>(<,><NN.*>)+(<,>)?<CC><NN.*>}
            NP: {(<RP|IN|NN.*|.*DT|RB|JJ.*|RB.*|POS|``|"|''|FW|POS-DATE|CD|TO|WRB>)*<NN.*>(<TO>(<DT>)?<NN.*>)?(<RB>)?(<IN>)?(<JJ|RB|CD|DT|POS>)*}
            NP: {<P-DATE><NP>}
            NP: {<NP><NP>}
            NP: {<NP><,><NP><,>}
            CC-NP: {<NP>(<CC><NP>)+}

            PP: {((<PDT>)?<DT>)?(<RB|IN|WRB|WDT|TO|JJ|PRP>)*<PRP.*>(<MD>)?}
            PP: {<WP|WRB>}
            PP: {<IN><WDT>(<DT|RBR>)*}
            PP: <,>{<DT><JJ>}

            NP: {<NP><PP><NP>}
            P-NP: {<PP><NP>(<,><NP><,>)?}
            C-PP: {(<CD><PP>|<PP><CD>)}
            CC-P-NP: {<P-NP|PP><CC><NP>}
            NP: {<NP><,>((<,|CC>)*<.*NP>)*<,>}

            VP: {<VB.*><IN><TO><DT><VB.*>}
            VP: {<VB.*><RP>}
            VP: {(<IN|TO|VB.*|.*DT|RB|JJ|EX|MD>)*<VB.*>(<JJ>)?(<RB>(<TO|JJ|>)?)?}
            VP: {<IN><DT><VB.*>(<RB><TO>)?}
            VP: {<RB|VB.*|MD|TO>*<VB.*><RB|VB.*|MD|TO>*}
            VP: {<VP><IN>}
            VP: {<IN><VP>(<RP>)?<TO>}
            VP: {((<DT>)?<IN>)?<WDT><VP>}
            VP: {<IN><DT-DATE><VP>}
            Y-DATE: <JJ>{<CD>}
            VP: {<JJ>}<Y-DATE>
            CC-VP: {<VP><NP><CC><VP><NP>}

            CC-NP: <VP>{<NP>(<,><NP>)*<CC><NP>}
            D-NP : <VP>{<.*DATE><.*NP>}

            CLAUSE-P: <,|CC>{<VP><P-NP>}(<,>|<CC>|<.*DATE>)
            CLAUSE-NS: <,>(<CC>)?{(<VP><.*NP>)+}<,>
            CLAUSE-NS: <CC>{(<VP><.*NP>)+}
            CLAUSE: {<NP>(<VP><.*NP>|<CC-VP>)+(.*P-NP)?}
            CLAUSE-P: {<PP|P-NP>(<VP><.*NP>|<CC-VP>)+}
            CLAUSE-P: <,>{<PP|P-NP><VP>}<,>
            CLAUSE-P: <,>{<PP|P-NP><VP><CLAUSE>}
            CLAUSE: <CC>{<NP><VP><CLAUSE-P>}
            CLAUSE-NS: <,>{<VP><.*NP>}
            CLAUSE-OSL: <CLAUSE-P><CC><,>{<NP>}<,>
            CLAUSE-OSR: <,>{<NP>}<CLAUSE-P>
            CLAUSE: {<NP><CLAUSE-P>}

            D-CLAUSE-P: {<CLAUSE-P><.*DATE>}
            D-CLAUSE-P: <,>{<DATE><CLAUSE-P>}<,>
            D-CLAUSE-P: <,>{<CLAUSE-P><,><VP><.*DATE>}
            D-CLAUSE: {<CLAUSE><.*DATE>}
            D-CLAUSE: {<.*DATE><,><CLAUSE>}<,>
            CLAUSE-NS: {<VP><.*NP>}
            D-CLAUSE-NS: {<CLAUSE-NS><.*DATE>}
            D-CLAUSE-NS: {<VP><NP><.*DATE>}<,>
            D-CLAUSE-NS: <CC>{<.*DATE>(<,>)?<CLAUSE-NS>}
            D-CLAUSE-P: {<P-NP><VP><.*DATE>}


            D-CLAUSE-M-P: {<.*DATE><,><CLAUSE-P>((<,|CC>)+<CLAUSE-P>)+}
            D-CLAUSE-M: {<.*DATE><,><CLAUSE-P>(<,>(<CC>)?<CLAUSE-NS>)+}
            D-CC-CLAUSE: {<.*DATE><CLAUSE><,><CC><CLAUSE>}
            D-CLAUSE: {<.*NP><.*VP><.*DATE>}
            D-CLAUSE: <,>{<.*DATE><.*CLAUSE.*>}
            D-CLAUSE-P: {<CLAUSE-P>(<,>)?(<.*NP>)?<.*DATE>}
            D-CLAUSE-P-L: <D-CLAUSE-P>(<,|CC>)+{<NP>(<,><NP>)*<.*DATE>}
            D-CLAUSE-P: {<.*DATE><,><CLAUSE-P>}
            D-CLAUSE-NS: <.*CLAUSE.*>(<,|CC>)*{<.*DATE>(<,>)?<CLAUSE-NS>}
            DD-CLAUSE: {<D-CLAUSE.*>(<,|CC>)+(<RB>)?<.*DATE>}
            D-CLAUSE-P: {<.*DATE><CLAUSE-P>}(<,>)?
            D-CLAUSE-P: (<,>)?{<CLAUSE-P><CC><D-CLAUSE-NS>}
             '''
        self.chunker = RegexpParser(grammar, loop=1)
        self.exclude = {s for s in string.punctuation if s not in [';', ':', '&', ',', ]}
        self.exclude.add('``')
        self.exclude.add("''")
Example #36
def GetPatternsTree(tagsList, pattern, patternName):
    gramaticalAnalyse = RegexpParser(pattern)
    tree = gramaticalAnalyse.parse(tagsList)
    patt = ExtractPhrases(tree, patternName)
    return patt
Example #37
from util import sub_leaves

SINGLE_WORD_FREQ_CUT_OFF = 6
PATTERNS = r'''
    NP:  {<CD|VBN>?<NN.*|JJ.*>*<CD>?<NN.*|VBG><CD>?}
'''
PATTERNS_X = r'''
    NP:  {<NN.*|JJ.*|CD>*<NN.*|VBG><CD>?}
         {<NN.*|JJ.*>*<CD>?<NN.*|VBG><CD>?}
'''
PATTERNS_ALT = r'''
    NP:  {<NN.*|JJ.*>*<NN.*><CC><NN.*|VBG><CD>?}
         {<NN.*|JJ.*>*<CD>?<NN.*|VBG><CD>?}
'''
# ('2009', 'CD'), ('Grammy', 'NNP'), ('Awards', 'NNS')
NP_CHUNCKER = RegexpParser(PATTERNS)
EARLY_CANDIDATE_CUTOFF = 25
LATE_CANDIDATE_CUTOFF = 10


def extract_candidates(tagged_sentences):
    '''
    Returns three lists:
        - the candidate key concepts of the given document;
        - the candidate key concepts occurring early in the given document; and 
        - the candidate key concepts occurring late in the given document.
        @param tagged_sentences: The POS tagged document.    
    '''
    #print tagged_sentences
    candidates = []
    early = set([])
Example #38
# Regex-based shallow parser.
# The Tree structures used to represent parsed sentences in NLTK get converted to ChunkString objects here.
# Create a RegexpParser object using chunking and chinking rules (classes ChunkRule and ChinkRule)

sample_sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# Create POS tagged tokens from the sample sentence
tagged_sentence = tag(sample_sentence)

print(tagged_sentence)

# Create the shallow parser
grammar = """
NP: {<DT>?<JJ>?<NN.*>}
ADJP: {<JJ>}
ADVP: {<RB.*>}
PP: {<IN>}
VP: {<MD>?<VB.*>+}
"""

rc = RegexpParser(grammar)

# Shallow parse the sample sentence
c = rc.parse(tagged_sentence)
print(c)

# Evaluate parser performance on test data
print(rc.evaluate(test_data))

Example #39
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

nltk.download('averaged_perceptron_tagger')

phrase = "I love Ice Cream. I also like steak"
tokenized_sentences = sent_tokenize(phrase)
tokenized_phrases = [
    word_tokenize(sentence) for sentence in tokenized_sentences
]

tagged_words = [pos_tag(phrase) for phrase in tokenized_phrases]
print(tagged_words)

grammar = r"""
NP: {<PRP|NN|NNP>}
"""

parser = RegexpParser(grammar)

results = [parser.parse(sentence) for sentence in tagged_words]
print(results)
results[0].draw()
Example #40
	def __init__(self, grammar="", loop=2):
		super(PostPatternStrategy, self).__init__()
		self.postChunker = RegexpParser(grammar, loop=loop)
		self.grammar = grammar
		self.loop = loop
Example #41
 def __init__(self, setupData):
     super(RegexpChunker, self).__init__(setupData)
     self.chunker = RegexpParser(setupData)
Example #42
class Chunker:
    def __init__(self):
        grammar = r'''
            R-DATE: {<IN><CD><TO><CD>}
            R-DATE: {<IN><CD><IN><CD>}
            R-DATE: {<JJ><CD><CC><CD>}
            FULL-DATE: {<IN><CD><NNP><CD>}
            FULL-DATE: <VB.*>{<CD><NNP><CD>}
            MONTH-DATE: {(<IN|DT>)?<NNP><CD>}
            NP: {<JJR><IN><CD><NNS>}
            NP: {<IN><CD><NNS>}
            NP: {<CD><IN><DT><CD><NNS>(<JJ>)?}
            DM_DATE: {<IN><CD><NNP>}(<,>|<NN.*>)
            DATE: {<IN>(<DT>)?<CD>}
            DT-DATE: {<DT><CD>}
            POS-DATE: <POS>{<CD>}
            V-DATE: {<IN|CD><JJ><CD>}
            DATE: (<,>)?{<CD>}<,>
            N-DATE: (<,>)?{((<.*DATE><,>)+)?<CD><CC><CD>}

            NN-LST: {<NN.*>(<,><NN.*>)+(<,>)?<CC><NN.*>}
            NP: {(<RP|IN|NN.*|.*DT|RB|JJ.*|RB.*|POS|``|"|''|FW|POS-DATE|CD|TO|WRB>)*<NN.*>(<TO>(<DT>)?<NN.*>)?(<RB>)?(<IN>)?(<JJ|RB|CD|DT|POS>)*}
            NP: {<P-DATE><NP>}
            NP: {<NP><NP>}
            NP: {<NP><,><NP><,>}
            CC-NP: {<NP>(<CC><NP>)+}

            PP: {((<PDT>)?<DT>)?(<RB|IN|WRB|WDT|TO|JJ|PRP>)*<PRP.*>(<MD>)?}
            PP: {<WP|WRB>}
            PP: {<IN><WDT>(<DT|RBR>)*}
            PP: <,>{<DT><JJ>}

            NP: {<NP><PP><NP>}
            P-NP: {<PP><NP>(<,><NP><,>)?}
            C-PP: {(<CD><PP>|<PP><CD>)}
            CC-P-NP: {<P-NP|PP><CC><NP>}
            NP: {<NP><,>((<,|CC>)*<.*NP>)*<,>}

            VP: {<VB.*><IN><TO><DT><VB.*>}
            VP: {<VB.*><RP>}
            VP: {(<IN|TO|VB.*|.*DT|RB|JJ|EX|MD>)*<VB.*>(<JJ>)?(<RB>(<TO|JJ|>)?)?}
            VP: {<IN><DT><VB.*>(<RB><TO>)?}
            VP: {<RB|VB.*|MD|TO>*<VB.*><RB|VB.*|MD|TO>*}
            VP: {<VP><IN>}
            VP: {<IN><VP>(<RP>)?<TO>}
            VP: {((<DT>)?<IN>)?<WDT><VP>}
            VP: {<IN><DT-DATE><VP>}
            Y-DATE: <JJ>{<CD>}
            VP: {<JJ>}<Y-DATE>
            CC-VP: {<VP><NP><CC><VP><NP>}

            CC-NP: <VP>{<NP>(<,><NP>)*<CC><NP>}
            D-NP : <VP>{<.*DATE><.*NP>}

            CLAUSE-P: <,|CC>{<VP><P-NP>}(<,>|<CC>|<.*DATE>)
            CLAUSE-NS: <,>(<CC>)?{(<VP><.*NP>)+}<,>
            CLAUSE-NS: <CC>{(<VP><.*NP>)+}
            CLAUSE: {<NP>(<VP><.*NP>|<CC-VP>)+(.*P-NP)?}
            CLAUSE-P: {<PP|P-NP>(<VP><.*NP>|<CC-VP>)+}
            CLAUSE-P: <,>{<PP|P-NP><VP>}<,>
            CLAUSE-P: <,>{<PP|P-NP><VP><CLAUSE>}
            CLAUSE: <CC>{<NP><VP><CLAUSE-P>}
            CLAUSE-NS: <,>{<VP><.*NP>}
            CLAUSE-OSL: <CLAUSE-P><CC><,>{<NP>}<,>
            CLAUSE-OSR: <,>{<NP>}<CLAUSE-P>
            CLAUSE: {<NP><CLAUSE-P>}

            D-CLAUSE-P: {<CLAUSE-P><.*DATE>}
            D-CLAUSE-P: <,>{<DATE><CLAUSE-P>}<,>
            D-CLAUSE-P: <,>{<CLAUSE-P><,><VP><.*DATE>}
            D-CLAUSE: {<CLAUSE><.*DATE>}
            D-CLAUSE: {<.*DATE><,><CLAUSE>}<,>
            CLAUSE-NS: {<VP><.*NP>}
            D-CLAUSE-NS: {<CLAUSE-NS><.*DATE>}
            D-CLAUSE-NS: {<VP><NP><.*DATE>}<,>
            D-CLAUSE-NS: <CC>{<.*DATE>(<,>)?<CLAUSE-NS>}
            D-CLAUSE-P: {<P-NP><VP><.*DATE>}


            D-CLAUSE-M-P: {<.*DATE><,><CLAUSE-P>((<,|CC>)+<CLAUSE-P>)+}
            D-CLAUSE-M: {<.*DATE><,><CLAUSE-P>(<,>(<CC>)?<CLAUSE-NS>)+}
            D-CC-CLAUSE: {<.*DATE><CLAUSE><,><CC><CLAUSE>}
            D-CLAUSE: {<.*NP><.*VP><.*DATE>}
            D-CLAUSE: <,>{<.*DATE><.*CLAUSE.*>}
            D-CLAUSE-P: {<CLAUSE-P>(<,>)?(<.*NP>)?<.*DATE>}
            D-CLAUSE-P-L: <D-CLAUSE-P>(<,|CC>)+{<NP>(<,><NP>)*<.*DATE>}
            D-CLAUSE-P: {<.*DATE><,><CLAUSE-P>}
            D-CLAUSE-NS: <.*CLAUSE.*>(<,|CC>)*{<.*DATE>(<,>)?<CLAUSE-NS>}
            DD-CLAUSE: {<D-CLAUSE.*>(<,|CC>)+(<RB>)?<.*DATE>}
            D-CLAUSE-P: {<.*DATE><CLAUSE-P>}(<,>)?
            D-CLAUSE-P: (<,>)?{<CLAUSE-P><CC><D-CLAUSE-NS>}
             '''
        self.chunker = RegexpParser(grammar, loop=1)
        self.exclude = {s for s in string.punctuation if s not in [';', ':', '&', ',', ]}
        self.exclude.add('``')
        self.exclude.add("''")

    def prepare_sentence(self, s: list) -> list:
        s = [n for n in s if n[0] not in self.exclude]
        txt = [w[0] for w in s]
        pos = nltk.pos_tag(txt)
        return [(w, ps, net) for (w, ps), (_, net) in zip(pos, s)]

    @staticmethod
    def tree_label_fix(tree: nltk.tree.Tree) -> nltk.tree.Tree:

        for st in tree:
            if isinstance(st, nltk.tree.Tree):
                if bool(re.match(r'.*CLAUSE.*', st.label())):
                    if not bool(re.match('.*D-.*CLAUSE.*', st.label())):
                        leafs = st.leaves()
                        if any([n for n in leafs if n[2] == 'DATE']):
                            # Fix the label of the tree
                            new_lbl = 'D-' + st.label()
                            st.set_label(new_lbl)
                    else:
                        leafs = st.leaves()
                        if not any([n for n in leafs if n[2] == 'DATE']):
                            oldlbl = st.label()
                            new_lbl = re.sub(r'D-', '', oldlbl)
                            st.set_label(new_lbl)
        return tree

    def generate_tree(self, s: list) -> nltk.tree.Tree:
        # noinspection PyTypeChecker
        t1 = self.chunker.parse(s)
        return self.tree_label_fix(t1)
Example #43
 def __init__(self, grammar="", loop=2):
     super(PostPatternStrategy, self).__init__()
     self.postChunker = RegexpParser(grammar, loop=loop)
     self.grammar = grammar
     self.loop = loop
Example #44
#t = npc.parse(tmp_arr_pos[0])
print("Finished loading...")

#print(len(t))
#t.draw()
#help(t)
sentCount = 1
sentScore = []          #tuple with (Subj-Obj , Verb-P , )
totalS = []

print("Processing input...")
print("Number of sentences to process: ", len(arr_pos))

for q in ["", vp, prd, cls1, cls2]:
    grammer += q
    npc = RegexpParser(grammer)
    print("\n\n")
    for i in arr_pos:
        print("Reading sentence ", sentCount)
        sentCount += 1
        t = npc.parse(i)
        print(t)
        tmpVP = []
        tmpNP = []
        tmpPrd = []
        tmpCls = []
        x1 = ""

        for x in t:
            try:
                if x.label() == "VP":
Example #45
farechunker = RegexpParser(r'''

CARRIER:
{<CODESHARE><CODESHARE><CODESHARE><CODESHARE>}
{<CODESHARE><CODESHARE><CODESHARE>}
{<CODESHARE><CODESHARE>}
{<CODESHARE><NN>}



ROUTE:
{<ROUTE>}

CABIN:
{<CABIN>}

RBD:
{<BOOKING><CLASS>}


CORPORATE_DISCOUNT:
{<CORPORATE><DISCOUNT>}
{<EFFECTIVE><DISCOUNT>}

AGENT_DISCOUNT:
{<DISCOUNT>}



FBC:
{<FBC><VBD><TO><VBP><DISCOUNT>}
{<FARE><BASIS>}


TICKET_VALIDITY:
{<TICKET><VALIDITY>}


LOCATION:
{<LOCATIONTYPE><NN><.*>}

AIRLINE:
{<CAT><PACIFIC><AIRWAYS><CITY>}

CLIENT:
<AIRLINE>{<.*><.*>}<TOURCODE>



''')
Example #46
sentencas_treinadoras = mac_morpho.tagged_sents()[0:15000]
# Create the UnigramTagger based on the default tagger and train it with the tagged sentences from mac_morpho
etiq = UnigramTagger(sentencas_treinadoras, backoff=etiqPadrao)

coment = str(input("Enter the text: "))
if coment == "default":
        coment = open("default.txt", "r").read().replace("\n", " ")
# The text is converted into tokens
tokens=nltk.word_tokenize(coment.lower())
# Each token of the text is tagged
tags = etiq.tag(tokens)

# Create the regular-expression parser containing the patterns of interest
analiseGramatical = RegexpParser(r"""
		PADRAO7: {<N><ADJ>}
        PADRAO1: {<ADJ><N>(<PREP>?<N>)*}
        PADRAO2: {<ADV><ADV>?<ADJ>(<N>(<PREP>?<N>)*)?}
        PADRAO3: {<N>(<PREP>?<N>)*(<ADJ>)<ADV><ADV>?}
        PADRAO4: {<N>(<PREP>?<N>)*<ADV>?<ADJ>+}
        PADRAO5: {<ADV><V>}
        PADRAO6: {<V><ADV>}
		""")
# The parser is then used to generate the pattern tree
arvore = analiseGramatical.parse(tags)
x = [ExtractPhrases(arvore, "PADRAO1"), ExtractPhrases(arvore, "PADRAO2"),
     ExtractPhrases(arvore, "PADRAO3"), ExtractPhrases(arvore, "PADRAO4"),
     ExtractPhrases(arvore, "PADRAO5"), ExtractPhrases(arvore, "PADRAO6"),
     ExtractPhrases(arvore, "PADRAO7")]
for aux in range(len(x)):
        print("PADRAO 0"+str(aux+1)+str(x[aux]))