Python RegexpParser.parseの例、nltk.chunk.RegexpParser.parse Pythonの例

コード例 #1

0

ファイルを表示

ファイル: Chunker.py プロジェクト: danjamker/N-Fly

    def Chunk(self,
              sentence,
              node='NP',
              grammer=r"""
                  NP: {<DT|PP\$>?<JJ>*<NN>}
                      {<NNP>+}
                      """):
        r'''
        Chunk every item of ``sentence`` and return the matched phrases as
        plain strings.

        An NLTK ``RegexpParser`` is built from ``grammer``; each item is
        parsed and the phrases that ``self.sub_leaves`` yields for ``node``
        are flattened into space-separated strings.

        @param sentence: iterable of items accepted by ``RegexpParser.parse``
            (presumably POS-tagged sentences -- confirm against callers)
        @param node: label handed to ``self.sub_leaves`` (default ``'NP'``)
        @param grammer: chunk grammar for ``RegexpParser``; the misspelled
            parameter name is kept so keyword callers keep working

        @return: A flat list with one space-joined string per phrase.
        '''
        cp = RegexpParser(grammer)

        results = []
        for sent in sentence:
            for phrase in self.sub_leaves(cp.parse(sent), node):
                # join() replaces the old "concatenate, then strip the
                # trailing space" loop and gives the same text.
                results.append(" ".join(word for (word, tag) in phrase))

        return results

コード例 #2

0

ファイルを表示

 def ProcessWoeds(self, arr):
     """POS-tag ``arr`` and chunk the result with a fixed grammar.

     The grammar groups optional adverb tags (``RB.?``) and verb tags
     (``VB.?``) followed by one ``NNP`` tag under the label ``Chunk``.

     Args:
         arr: value handed to ``pos_tag`` (presumably a list of word
             tokens -- confirm against callers).

     Returns:
         Whatever ``RegexpParser.parse`` returns for the tagged tokens.
     """
     tagged = pos_tag(arr)
     chunkGram = r"""Chunk:{<RB.?>*<VB.?>*<NNP>}"""
     chunkParser = RegexpParser(chunkGram)
     # A ``print(chunked)`` used to follow the return statement; it could
     # never execute, so it has been dropped.
     return chunkParser.parse(tagged)

コード例 #3

0

ファイルを表示

ファイル: Chunker.py プロジェクト: danjamker/N-Fly

    def Chunk(self, sentence, node='NP', grammer=r"""
                  NP: {<DT|PP\$>?<JJ>*<NN>}
                      {<NNP>+}
                      """):
        r'''
        Chunk every item of ``sentence`` and return the matched phrases as
        plain strings.

        An NLTK ``RegexpParser`` is built from ``grammer``; each item is
        parsed and the phrases that ``self.sub_leaves`` yields for ``node``
        are flattened into space-separated strings.

        @param sentence: iterable of items accepted by ``RegexpParser.parse``
            (presumably POS-tagged sentences -- confirm against callers)
        @param node: label handed to ``self.sub_leaves`` (default ``'NP'``)
        @param grammer: chunk grammar for ``RegexpParser``; the misspelled
            parameter name is kept so keyword callers keep working

        @return: A flat list with one space-joined string per phrase.
        '''
        cp = RegexpParser(grammer)

        results = []
        for sent in sentence:
            for phrase in self.sub_leaves(cp.parse(sent), node):
                # join() replaces the old "concatenate, then strip the
                # trailing space" loop and gives the same text.
                results.append(" ".join(word for (word, tag) in phrase))

        return results

コード例 #4

0

ファイルを表示

def get_chunks(tagged_sentences):
    """Return ``"noun:adjective"`` strings for noun/adjective chunks.

    Every sentence is chunked with a two-rule grammar (noun ... adjective
    and adjective ... noun, with up to three arbitrary tags in between).
    For each chunk the first word tagged NN/NNS/NNP and the first word
    tagged JJ/JJR/JJS are joined with a colon.

    Chunks that hold no whitelisted noun or no whitelisted adjective (for
    example when the only noun is tagged ``NNPS``) are skipped; previously
    such a chunk raised ``IndexError`` when the result was assembled.

    Args:
        tagged_sentences: iterable of sentences accepted by
            ``RegexpParser.parse`` (sequences of ``(word, tag)`` pairs).

    Returns:
        list of ``"noun:adjective"`` strings in encounter order.
    """
    noun_tags = ('NN', 'NNS', 'NNP')
    adjective_tags = ('JJ', 'JJR', 'JJS')
    grammar = r"""
    CHUNK1:
        {<NN.*><.*>{0,3}<JJ.*>}  # Any Noun terminated with Any Adjective
    
    CHUNK2:
        {<JJ.*><.*>{0,3}<NN.*>}  # Nouns or Adjectives, terminated with Nouns
    """
    cp = RegexpParser(grammar)
    pairs = []
    for sent in tagged_sentences:
        tree = cp.parse(sent)
        for subtree in tree.subtrees(
                filter=lambda t: t.label() in ('CHUNK1', 'CHUNK2')):
            leaves = subtree.leaves()
            nouns = [word for word, tag in leaves if tag in noun_tags]
            adjectives = [
                word for word, tag in leaves if tag in adjective_tags
            ]
            # Checking the extracted lists replaces the old substring
            # search on str(subtree) and guarantees index 0 exists.
            if nouns and adjectives:
                pairs.append(nouns[0] + ":" + adjectives[0])
    return pairs

コード例 #5

0

ファイルを表示

ファイル: Chunker.py プロジェクト: buhtigexa/Nerit

class RegexpChunker(Chunker):
    """
    N-gram tagger (chunker) that uses grammars to detect phrases.

    setupData: the grammar string; it is passed to the base class and to
    ``RegexpParser``.
    """

    def __init__(self, setupData):
        super(RegexpChunker, self).__init__(setupData)
        self.chunker = RegexpParser(setupData)

    def tag(self, data):
        """Chunk ``data`` and return its CoNLL-style tags.

        When ``self.fixer_function`` is truthy it is applied to ``data``
        first (the attribute is not set in this class; presumably the base
        class provides it -- confirm).

        Returns the result of ``tree2conlltags`` on the parsed tree, or
        ``None`` when parsing or conversion raises.
        """
        if self.fixer_function:
            data = self.fixer_function(data)
        try:
            return tree2conlltags(self.chunker.parse(data))
        except Exception:
            # Best-effort by design: callers receive None instead of an
            # exception when the sentence cannot be chunked.
            return None

コード例 #6

0

ファイルを表示

ファイル: Chunker.py プロジェクト: magicisland/tpd

class RegexpChunker(Chunker):
    """
    N-gram tagger (chunker) that uses grammars to detect phrases.

    setupData: the grammar string; it is passed to the base class and to
    ``RegexpParser``.
    """

    def __init__(self, setupData):
        super(RegexpChunker, self).__init__(setupData)
        self.chunker = RegexpParser(setupData)

    def tag(self, data):
        """Chunk ``data``, print the parse tree and return CoNLL-style tags.

        When ``self.fixer_function`` is truthy it is applied to ``data``
        first (the attribute is not set in this class; presumably the base
        class provides it -- confirm).

        Returns the result of ``tree2conlltags`` on the parsed tree, or
        ``None`` when parsing, printing or conversion raises.
        """
        if self.fixer_function:
            data = self.fixer_function(data)
        try:
            parsedTree = self.chunker.parse(data)
            # Call form of print: valid on Python 2 and Python 3 alike.
            print(parsedTree)
            return tree2conlltags(parsedTree)
        except Exception:
            # Best-effort by design: callers receive None instead of an
            # exception when the sentence cannot be chunked.
            return None

コード例 #7

0

ファイルを表示

ファイル: nlrequestprocess.py プロジェクト: AlexDel/autobot

def parse_request(message):
    """Extract vendor, model, part name and a profanity flag from a request.

    The message is tokenized, each token is tagged by regular expression
    (VENDOR, MODEL, PREP, PROFANITY, PART_NAME) and the tagged tokens are
    chunked. For every field the first matching subtree is used.

    Args:
        message: the request text.

    Returns:
        dict with keys ``'vendor'`` (first word of the first VENDOR chunk
        or None), ``'model'`` and ``'part_name'`` (space-joined words of
        the first MODEL / PART_NAME chunk or None) and ``'profanity'``
        (True when at least one PROFANITY chunk exists).
    """
    tagPatterns = [
        (r'(honda|toyota|ford|kia|hyundai|audi|bmw|opel|mitsubishi|mazda|skoda|skoda|subaru)$',
         'VENDOR'),
        (r'([a-zA-Z0-9]+)$', 'MODEL'),
        (r'(от|для)$', 'PREP'),
        (r'(нах|бля|твою мать)$', 'PROFANITY'),
        (r'([а-яА-Я]+)$', 'PART_NAME'),
    ]

    tagger = nltk.RegexpTagger(tagPatterns)
    taggedRequest = tagger.tag(nltk.word_tokenize(message))

    chunker = RegexpParser(r'''
	    S: {<CAR> <PREP>? <PART_NAME>}
	    MODEL: {<MODEL>+}
	    VENDOR: {<VENDOR>}
	    CAR: {<VENDOR> <MODEL>}
	    PROFANITY: {<PROFANITY>+}
	    PART_NAME: {<PART_NAME>+}
	''')

    tree = chunker.parse(taggedRequest)

    def first_words(label):
        """Return the words of the first ``label`` subtree, or None."""
        match = next(tree.subtrees(lambda t: t.label() == label), None)
        if match is None:
            return None
        return [leaf[0] for leaf in match.leaves()]

    # An explicit "first match or None" lookup replaces the former
    # try/except blocks that used IndexError as control flow.
    vendor_words = first_words('VENDOR')
    model_words = first_words('MODEL')
    part_words = first_words('PART_NAME')

    parsed_request = {}
    parsed_request['vendor'] = vendor_words[0] if vendor_words else None
    parsed_request['model'] = (
        ' '.join(model_words) if model_words is not None else None)
    parsed_request['part_name'] = (
        ' '.join(part_words) if part_words is not None else None)
    parsed_request['profanity'] = first_words('PROFANITY') is not None

    return parsed_request

コード例 #8

0

ファイルを表示

ファイル: nltk_processors.py プロジェクト: awoziji/forte

class NLTKChunker(PackProcessor):
    r"""Pack processor that chunks each sentence with an NLTK
    ``RegexpParser`` and creates a ``Phrase`` per chunk.
    """
    def __init__(self):
        super().__init__()
        # Built in ``initialize`` from the configured pattern.
        self.chunker = None

    # pylint: disable=unused-argument
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        self.chunker = RegexpParser(configs.pattern)

    @classmethod
    def default_configs(cls):
        r"""Return the parent defaults extended with the chunker settings:
        the chunk ``pattern`` plus the optional token and sentence
        component filters.
        """
        defaults = super().default_configs()
        chunker_settings = {
            'pattern': 'NP: {<DT>?<JJ>*<NN>}',
            'token_component': None,
            'sentence_component': None
        }
        defaults.update(chunker_settings)
        return defaults

    def _process(self, input_pack: DataPack):
        sentences = input_pack.get(
            Sentence, components=self.configs.sentence_component)
        for sentence in sentences:
            token_entries = list(
                input_pack.get(entry_type=Token,
                               range_annotation=sentence,
                               components=self.configs.token_component))

            tagged = [(entry.text, entry.pos) for entry in token_entries]
            parsed = self.chunker.parse(tagged)

            # ``cursor`` is the index in ``token_entries`` of the first
            # token covered by the node currently being visited.
            cursor = 0
            for node in parsed:
                if not hasattr(node, 'label'):
                    # Plain (word, tag) leaf, e.g. ('is', 'VBZ').
                    cursor += 1
                    continue

                # Chunk subtree, e.g.
                # Tree('NP', [('This', 'DT'), ('tool', 'NN')])
                begin_pos = token_entries[cursor].span.begin
                end_pos = token_entries[cursor + len(node) - 1].span.end
                phrase = Phrase(input_pack, begin_pos, end_pos)
                phrase.phrase_type = node.label()

                cursor += len(node)

コード例 #9

0

ファイルを表示

ファイル: exercise_03.py プロジェクト: psnint/IST-Projects

def generate_chunks(tagged_sent, expression=r'CHUNK: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'):
    """Return the leaves of every ``CHUNK`` subtree found in ``tagged_sent``.

    The sentence is parsed with a ``RegexpParser`` built from
    ``expression``; an empty sentence is represented by an empty ``S``
    tree. If a ``ValueError`` is raised while parsing or collecting, an
    empty list is returned.
    """
    parser = RegexpParser(expression)
    try:
        if len(tagged_sent) == 0:
            parsed = Tree('S', [])
        else:
            parsed = parser.parse(tagged_sent, trace=0)
        return [
            node.leaves() for node in parsed.subtrees()
            if node.label() == "CHUNK"
        ]
    except ValueError:
        return []

コード例 #10

0

ファイルを表示

    def _chunker(self, tuple_sent):
        """Chunk base-phrases using chunking rules.

        The grammar is obtained from
        ``self._ChunkingRule(self._CHUNK_RULE_VXP_)``.

        Args:
            tuple_sent (list(tuple(str, str))): the sentence as pairs.

        Returns:
            The tree returned by ``RegexpParser.parse``, shaped like
            ``Tree('S', [Tree('CHUNK', [(str, str), (str, str)]),
            (str, str), ...])``.
        """
        chunker = RegexpParser(self._ChunkingRule(self._CHUNK_RULE_VXP_))
        return chunker.parse(tuple_sent)

コード例 #11

0

ファイルを表示

ファイル: keywords.py プロジェクト: matthiase/bitbucket

def find_keywords(text):
  """
    Extracts keywords from text.

    Candidate phrases are the KEYWORD chunks (runs of proper nouns, runs of
    nouns, adjectives followed by nouns) whose text is contained in the
    result of ``find_collocations`` over all words. A phrase that is a
    prefix of an already counted, at-least-as-long phrase is counted under
    that phrase.

    Args:
      text: A text fragment.

    Returns:
      A list containing the extracted keywords, most frequent first.
  """
  grammar = r'''
    KEYWORD: {<NNP><NNP>+}
        {<NN.*><NN.*>+}
        {<JJ>+<NN>+}
  '''
  parser = RegexpParser(grammar)
  sentences = []
  words = []
  for sentence in sent_tokenize(text):
    tokens = word_tokenize(sentence)
    if not tokens:
      continue
    sentences.append(tokens)
    words += tokens

  collocations = find_collocations(words)

  keywords = []
  for sentence in sentences:
    tree = parser.parse(pos_tag(sentence))
    for node in _select_nodes(tree, ['KEYWORD']):
      word = ' '.join(p[0] for p in node)
      if word in collocations:
        keywords.append(word)

  # Longest phrases first, so shorter ones can fold into them below.
  keywords.sort(key=lambda k: len(k.split()), reverse=True)
  instances = {}
  for k in keywords:
    key = k
    for existing in instances:
      # Literal prefix test. The former re.match(k, existing) interpreted
      # the keyword as a regular expression, so keywords containing
      # characters such as '+', '(' or '.' failed or matched wrongly.
      if existing.startswith(k):
        key = existing
        break
    instances[key] = instances.get(key, 0) + 1
  # sorted() plus a list comprehension return a real list on both
  # Python 2 and Python 3.
  ranked = sorted(instances.items(), key=lambda item: item[1], reverse=True)
  return [item[0] for item in ranked]

コード例 #12

0

ファイルを表示

def rule_based_reqs_chunk(tagged_reqs, ids):
    """Collect the words of every top-level ``NP`` chunk per requirement.

    Each element of ``tagged_reqs`` is parsed with a ``RegexpParser`` built
    from the module-level ``ruleset``. For every direct child labelled
    ``NP`` the words whose tag is neither ``DT`` nor ``PRP$`` form one
    term. ``ids`` is accepted but not used.

    Returns:
        ``(terms, term_index)``: the list of word lists and, in parallel,
        the index of the requirement each term came from.
    """
    skipped_tags = ('DT', 'PRP$')
    parser = RegexpParser(ruleset)
    terms, term_index = [], []
    for position, tagged in enumerate(tagged_reqs):
        for child in parser.parse(tagged):
            # Leaves are plain tuples; only NP subtrees are of interest.
            if isinstance(child, tuple) or child.label() != 'NP':
                continue
            terms.append(
                [pair[0] for pair in child if pair[1] not in skipped_tags])
            term_index.append(position)
    return terms, term_index

コード例 #13

0

ファイルを表示

class NLTKChunker(PackProcessor):
    r"""Pack processor that chunks each sentence with an NLTK
    ``RegexpParser`` and adds a ``Phrase`` entry per chunk.
    """
    def __init__(self):
        super().__init__()
        # Built in ``initialize`` from the configured pattern.
        self.chunker = None
        # Component filter forwarded when fetching tokens.
        self.token_component = None

    # pylint: disable=unused-argument
    def initialize(self, resource: Resources, configs: HParams):
        self.chunker = RegexpParser(configs.pattern)

    @staticmethod
    def default_configs():
        r"""Return the default configuration: only the chunk ``pattern``.
        """
        defaults = {
            'pattern': 'NP: {<DT>?<JJ>*<NN>}',
        }
        return defaults

    def _process(self, input_pack: DataPack):
        for sentence in input_pack.get(Sentence):
            token_entries = list(
                input_pack.get(entry_type=Token,
                               range_annotation=sentence,
                               component=self.token_component))
            tagged = [(entry.text, entry.pos) for entry in token_entries]
            parsed = self.chunker.parse(tagged)

            # ``cursor`` is the index in ``token_entries`` of the first
            # token covered by the node currently being visited.
            cursor = 0
            for node in parsed:
                if not hasattr(node, 'label'):
                    # Plain (word, tag) leaf, e.g. ('is', 'VBZ').
                    cursor += 1
                    continue

                # Chunk subtree, e.g.
                # Tree('NP', [('This', 'DT'), ('tool', 'NN')])
                begin_pos = token_entries[cursor].span.begin
                end_pos = token_entries[cursor + len(node) - 1].span.end
                phrase = Phrase(input_pack, begin_pos, end_pos)
                phrase.set_fields(phrase_type=node.label())
                input_pack.add_or_get_entry(phrase)
                cursor += len(node)

コード例 #14

0

ファイルを表示

	def parse(self):
		"""
			Parse the tokenized text with the grammar chosen by
			``self.own_tag`` and record every subtree whose label is not
			"S" as ``[label, leaves]`` in ``self.tagged_nodes``; the
			accumulated list is printed at the end.
		"""
		grammar = Parser.GRAMMAR_OWN_TAG if self.own_tag else Parser.GRAMMAR
		parsed = RegexpParser(grammar).parse(self.tokens)

		for node in parsed.subtrees():
			if node.label() != "S":
				self.tagged_nodes.append([node.label(), node.leaves()])
		print(self.tagged_nodes)

コード例 #15

0

ファイルを表示

ファイル: seva_toie.py プロジェクト: jitinkrishnan/NASA-SE

def additionalExtractions(dep_triples, tagged_sentence, svo_triples):
    """Derive extra triples from small noun phrases that contain the subject
    or object of an SVO triple.

    ``SmallNP`` chunks (a number or adjective followed by nouns) are found
    in ``tagged_sentence``. For every chunk and every triple, element 0 and
    element 2 of the triple are looked up with ``tag_index``; each hit
    (anything but -1) contributes ``chunk_triples(leaves, hit)``.
    ``dep_triples`` is accepted but not used.

    Returns:
        None when ``svo_triples`` is empty, otherwise the collected list.
    """
    if not svo_triples:
        return None
    parser = RegexpParser("SmallNP: {(<CD.*>|<JJ.*>)<NN.*>+}")
    parsed = parser.parse(tagged_sentence)
    collected = []
    for phrase in parsed.subtrees():
        if phrase.label() != 'SmallNP':
            continue
        for triple in svo_triples:
            leaves = phrase.leaves()
            # Slot 0 and slot 2 of the triple, checked in that order.
            for slot in (0, 2):
                hit = tag_index(leaves, triple[slot])
                if hit != -1:
                    collected.extend(chunk_triples(leaves, hit))
    return collected

コード例 #16

0

ファイルを表示

	def preprocessing(self,desc):
		"""Strip punctuation and adverbs from ``desc`` and return the result.

		Commas become spaces, a fixed set of symbols is deleted, "I/O" is
		rewritten to "IO" and any other "/" to " and ". The text is then
		POS-tagged and every RB/RBS/RBR token is chunked under the label
		RB; each such word is removed from the text (together with the
		following space when it is the first node, otherwise together with
		the preceding space). The cleaned text and the chunk tree are
		printed along the way.
		"""
		desc = desc.replace(",", " ")
		for symbol in "!@#%():{}`[]'*&^":
			desc = desc.replace(symbol, "")
		print(desc)
		# Must happen before the generic "/" replacement below.
		desc = desc.replace("I/O", "IO")
		desc = desc.replace("/", " and ")
		tokenized = nltk.word_tokenize(desc)
		posTag = nltk.pos_tag(tokenized)
		grammar = '''
		RB: {<RB> | <RBS> | <RBR>}'''
		chunker = RegexpParser(grammar)
		chunked = chunker.parse(posTag)
		print(chunked)

		for n, node in enumerate(chunked):
			rendered = str(node)
			# Chunk subtrees render as "(RB word/TAG)"; leaves do not.
			if not rendered.startswith('(RB'):
				continue
			removalWord = rendered.split(" ")[1].split("/")[0]
			# ``n == 0`` replaces the identity test ``n is 0``; the old
			# second branch compared ``n`` with the builtin ``len`` and in
			# effect only meant "not the first node".
			if n == 0:
				desc = desc.replace(removalWord + " ", "")
			else:
				desc = desc.replace(" " + removalWord, "")
		return desc

コード例 #17

0

ファイルを表示

    def exctract_ngrams(self, tagged_sent):
        '''
        Extract n-grams from a previously POS-tagged sentence.

        The sentence is chunked with the module-level ``CHUNK_RULE``. Every
        item yielded by ``self.__leaves`` that differs from the whole
        sentence is turned into a string of stemmed, lower-cased words and
        kept when ``self.__evaluate_polarity_ngram`` accepts it.

        Keyword arguments:
        @param tagged_sent the POS-tagged sentence whose n-grams are extracted
        @return list of accepted n-gram strings
        '''
        chunker = RegexpParser(CHUNK_RULE)
        tree = chunker.parse(tagged_sent)
        ngrams = []
        for item in self.__leaves(tree):
            if item == tagged_sent:
                # The whole sentence is not an n-gram candidate.
                continue
            probable_ngram = ' '.join(
                self.__stemmer.stem(word.lower()) for (word, pos) in item)
            if self.__evaluate_polarity_ngram(probable_ngram):
                ngrams.append(probable_ngram)
        return ngrams

コード例 #18

0

ファイルを表示

ファイル: chunker.py プロジェクト: AdamMeyers/The_Termolator

def extractPossibleTerms(root, fileids):
    """Return the set of candidate terms found in a plaintext corpus.

    Every sentence of the corpus is POS-tagged and chunked with the grammar
    ``NP: {<JJ>*<NNP>*<NN>*}``; the words of each subtree other than the
    sentence root are joined with spaces and added to the result. The
    sentence count is printed first, then a progress counter every 100
    sentences.

    Args:
        root: corpus root passed to ``PlaintextCorpusReader``.
        fileids: file ids passed to ``PlaintextCorpusReader``.

    Returns:
        set of term strings.
    """
    reader = PlaintextCorpusReader(root, fileids)
    chunker = RegexpParser('NP: {<JJ>*<NNP>*<NN>*}')

    def node_label(subtree):
        # NLTK 3 replaced the ``node`` attribute with ``label()``;
        # support both so the function works with either release.
        if hasattr(subtree, 'label'):
            return subtree.label()
        return subtree.node

    terms = set()
    print(len(reader.sents()))
    for i, sent in enumerate(reader.sents(), 1):
        if i % 100 == 0:
            print(i)
        tree = chunker.parse(pos_tag(sent))
        # Exclude the sentence root node.
        for chunk in tree.subtrees(lambda st: node_label(st) != 'S'):
            terms.add(' '.join([el[0] for el in chunk]))
    return terms

コード例 #19

0

ファイルを表示

ファイル: noun_pharse.py プロジェクト: naveenjr/Nounpharse-extraction

def chunking_noun(document):
    """Return the two- and three-word noun phrases of ``document`` together
    with the tagged tokens and the per-tag counts.

    Returns:
        ``(serch_keywords, tagged, counts)`` where ``serch_keywords`` are
        the PHRASE chunks with two or three space-separated words,
        ``tagged`` is the POS-tagged token list and ``counts`` maps each
        tag to its number of occurrences.
    """
    tagged = nltk.pos_tag(word_tokenize(document))
    counts = dict(Counter(tag for _word, tag in tagged))

    chunkGram = r""" PHRASE: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"""
    parsed = RegexpParser(chunkGram).parse(tagged)

    serch_keywords = []
    for subtree in parsed.subtrees():
        if subtree.label() != 'PHRASE':
            continue
        phrase = ' '.join(word for word, _tag in subtree.leaves())
        # Keep only phrases of two or three words.
        if 1 < len(phrase.split(' ')) <= 3:
            serch_keywords.append(phrase)
    return serch_keywords, tagged, counts

コード例 #20

0

ファイルを表示

ファイル: utils.py プロジェクト: psnint/IST-Projects

def get_noun_phrases(text_list, tagger):
    """Return the noun phrases found in every text of ``text_list``.

    Each text is split on whitespace and tagged with ``tagger.tag``; the
    tagged sentence is chunked and the words of every ``NOUN_PHRASE``
    subtree are collected as one list.

    A sentence that raises ``ValueError`` while being chunked contributes
    nothing. Previously such a sentence also wiped the phrases already
    collected from all earlier sentences.

    Args:
        text_list: iterable of strings.
        tagger: object whose ``tag`` method accepts a list of tokens.

    Returns:
        list of word lists, one per noun phrase.
    """
    tagged_texts = [tagger.tag(text.split()) for text in text_list]

    expression = r'NOUN_PHRASE: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'

    chunkParser = RegexpParser(expression)

    noun_phrases = []
    for tagged_sent in tagged_texts:
        if len(tagged_sent) == 0:
            # Nothing to chunk; an empty sentence has no phrases.
            continue
        try:
            tree = chunkParser.parse(tagged_sent, trace=0)
            found = [[el[0] for el in subtree.leaves()]
                     for subtree in tree.subtrees()
                     if subtree.label() == "NOUN_PHRASE"]
        except ValueError:
            # Drop only the sentence that failed.
            continue
        noun_phrases.extend(found)
    return noun_phrases

コード例 #21

0

ファイルを表示

def extract_candidate_phrases(document_obj, parts_of_speech_re=DEFAULT_RE):
    '''
    Extract the candidate phrases of every sentence of a document.

    Each sentence's tokens are turned into (token string, POS) pairs and
    chunked; the leaves of every subtree labelled ``STAGE_MARKER`` are
    joined with spaces to form one candidate phrase.

    :param document_obj: document from which you want to extract parts of the speech (candidate phrases)
    :param parts_of_speech_re: regular expression with parts of speech structure
    :return: dict, keys are the sentence id and values list of candidate phrases for that sentence
    '''
    sentences = document_obj.get_sentences()

    # The grammar does not change between sentences, so the parser is
    # built once instead of once per sentence.
    regex_parser = RegexpParser(parts_of_speech_re)

    candidate_phrases = {}
    for sentence in sentences:
        sentence_id = sentence.get_sentence_id()

        # list of tuples with token and its pos
        token_pos_list = [(token_obj.get_token_str(),
                           token_obj.get_token_pos())
                          for token_obj in sentence.get_tokens()]

        sentence_regex_tree = regex_parser.parse(token_pos_list)

        # one phrase per subtree carrying the STAGE_MARKER label
        candidate_phrases[sentence_id] = [
            ' '.join(leaf[0] for leaf in subtree.leaves())
            for subtree in sentence_regex_tree.subtrees(
                filter=lambda t: t.label() == STAGE_MARKER)
        ]

    return candidate_phrases

コード例 #22

0

ファイルを表示

class TreeChunker(ContextChunker):
    """Chunker that tags tokens by running an NLTK ``RegexpParser`` over one
    attribute of a context and flattening the resulting tree into labels.
    """

    def __init__(self,
                 patterns: str,
                 loop: int = 1,
                 trace: int = 0,
                 attribute: str = 'pos',
                 apply_iob2: bool = True) -> None:
        # Name of the context attribute whose values act as chunking tags.
        self.__attribute = attribute
        # Whether labels inside a chunk receive a 'B-' / 'I-' prefix.
        self.__apply_iob2 = apply_iob2
        self.__regex_parser = RegexpParser(patterns,
                                           root_label='',
                                           loop=loop,
                                           trace=trace)

    def tag(self, context: Context) -> List[str]:
        """Return one label per element visited in the parsed tree."""
        # Empty attribute values are replaced by the placeholder 'NULL'.
        chunk_tags = [
            'NULL' if value == '' else value
            for value in context.get(self.__attribute)
        ]
        pairs = list(zip(context.get('tokens'), chunk_tags))
        parsed = self.__regex_parser.parse(pairs)
        return self._traverse_tree(parsed)

    def _traverse_tree(self, tree, is_subtree: bool = False):
        """Flatten ``tree`` depth-first into a list of labels.

        A leaf receives the label of the tree that directly contains it.
        Inside a subtree the label is prefixed with 'B-' for the first
        child and 'I-' for the others when IOB2 is enabled, and with an
        empty prefix otherwise.
        """
        collected = []
        for position, child in enumerate(tree):
            if isinstance(child, nltk.tree.Tree):
                collected += self._traverse_tree(child, True)
                continue

            label = tree.label()
            if is_subtree:
                if self.__apply_iob2:
                    prefix = 'B-' if position == 0 else 'I-'
                else:
                    prefix = ''
                label = f'{prefix}{label}'
            collected.append(label)

        return collected

コード例 #23

0

ファイルを表示

ファイル: PostPatternStrategy.py プロジェクト: buhtigexa/Nerit

class PostPatternStrategy(Strategy):
    """
    There are cases where the phrases we want to detect depend on the
    words rather than on their grammatical class. We can also be more
    precise if different levels of the tree can be considered, e.g.
    phrases and words together inside one rule as a single token. This
    strategy allows looking at the tree in height and width, so the
    grammars we write can be more precise and flexible.
    """
    def __init__(self, grammar="", loop=2):
        super(PostPatternStrategy, self).__init__()
        # ``loop`` must be passed by keyword: the second positional
        # parameter of RegexpParser is the root label, so the former
        # ``RegexpParser(grammar, loop)`` set the root label to the loop
        # count and left the number of passes at its default.
        self.postChunker = RegexpParser(grammar, loop=loop)
        self.grammar = grammar
        self.loop = loop

    def fix(self, feature):
        """Re-chunk ``feature`` on word/IOB level and return cleaned tags.

        ``feature`` is unpacked as (word, POS tag, IOB tag) triples. The
        sentence handed to the parser pairs each word with
        ``word + "_" + iob`` instead of its POS tag. The resulting tree is
        flattened, converted back to tuples and passed to ``cleanIobs``.

        Returns the result of ``cleanIobs``, or ``feature`` unchanged when
        any step raises.
        """
        try:
            # Separator character between levels inside a single token.
            grammar_pattern_to_clean = r'_.*'
            clean_pattern = ''
            modified_chunk_pattern = r'.*_'
            words, post, iobs = zip(*feature)
            # The sentences to parse no longer carry the POS tag, but the
            # IOB tags and the words.
            wiobs = tuple(w + "_" + iob for w, iob in zip(words, iobs))
            # A real list: zip() is a one-shot iterator on Python 3.
            sentence = list(zip(words, wiobs))
            tree = self.postChunker.parse(sentence)
            # Go from tree back to a list of tuples.
            loc_tags = tree2conlltags(flatten_deeptree(tree))
            return cleanIobs(words, post, loc_tags,
                             grammar_pattern_to_clean,
                             modified_chunk_pattern, clean_pattern)
        except Exception:
            # Best-effort by design: fall back to the unmodified input.
            return feature

コード例 #24

0

ファイルを表示

ファイル: search_tags.py プロジェクト: bhargavaganti/NLBooking

def get_search_tags(a, verbose=False):
    """Chunk ``a`` into STAG phrases and return ``extract_tags`` of the tree.

    An STAG is one adverb, verb, preposition or conjunction tag, followed
    by one adjective or determiner tag, followed by one or more noun tags.
    With ``verbose`` set, a banner, the POS tags and the chunk tree are
    printed.
    """
    def report(*parts):
        # Print only in verbose mode.
        if verbose:
            print(*parts)

    report()
    report('-' * 100)
    report("\tRunning `get_search_tags`...")
    report('-' * 100)

    search_tag_parser = RegexpParser("STAG: {\
        (<RB>|<RBR>|<RBS>|<VB>|<VB[A-Z]>|<IN>|<CC>)\
        (<JJ>|<JJR>|<JJS>|<DT>)\
        (<NN>|<NNS>|<NNP>|<NNPS>)+\
        }")

    pos_tags = pos_tag(word_tokenize(a))
    report("Part of Speech Tags:", pos_tags, '\n')

    data = search_tag_parser.parse(pos_tags)
    report("Matched Search Tags:", data)

    return extract_tags(data)

コード例 #25

0

ファイルを表示

ファイル: keywords.py プロジェクト: neelkanthpoosa/KeywordExtractionApp

def extract_candidate_keywords(document):
    """Return the candidate keyword phrases of ``document``.

    The document is tokenized, POS-tagged and chunked into PHRASE chunks
    (optional "adjectives nouns preposition" prefix followed by adjectives
    and nouns). A phrase is kept when it is longer than three characters
    and has fewer than six space-separated words.
    """
    tagged = nltk.pos_tag(word_tokenize(document))

    chunkGram = r""" PHRASE: 
                        {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}
                """
    parsed = RegexpParser(chunkGram).parse(tagged)

    candidate_keywords = []
    for subtree in parsed.subtrees():
        if subtree.label() != 'PHRASE':
            continue
        phrase = ' '.join(word for word, _tag in subtree.leaves())
        if len(phrase) > 3 and len(phrase.split(' ')) < 6:
            candidate_keywords.append(phrase)
    return candidate_keywords

コード例 #26

0

ファイルを表示

ファイル: PostPatternStrategy.py プロジェクト: buhtigexa/Nerit

class PostPatternStrategy(Strategy):
    """
    There are cases where the phrases we want to detect depend on the
    words rather than on their grammatical class. We can also be more
    precise if different levels of the tree can be considered, e.g.
    phrases and words together inside one rule as a single token. This
    strategy allows looking at the tree in height and width, so the
    grammars we write can be more precise and flexible.
    """

    def __init__(self, grammar="", loop=2):
        super(PostPatternStrategy, self).__init__()
        # ``loop`` must be passed by keyword: the second positional
        # parameter of RegexpParser is the root label, so the former
        # ``RegexpParser(grammar, loop)`` set the root label to the loop
        # count and left the number of passes at its default.
        self.postChunker = RegexpParser(grammar, loop=loop)
        self.grammar = grammar
        self.loop = loop

    def fix(self, feature):
        """Re-chunk ``feature`` on word/IOB level and return cleaned tags.

        ``feature`` is unpacked as (word, POS tag, IOB tag) triples. The
        sentence handed to the parser pairs each word with
        ``word + "_" + iob`` instead of its POS tag. The resulting tree is
        flattened, converted back to tuples and passed to ``cleanIobs``.

        Returns the result of ``cleanIobs``, or ``feature`` unchanged when
        any step raises.
        """
        try:
            # Separator character between levels inside a single token.
            grammar_pattern_to_clean = r'_.*'
            clean_pattern = ''
            modified_chunk_pattern = r'.*_'
            words, post, iobs = zip(*feature)
            # The sentences to parse no longer carry the POS tag, but the
            # IOB tags and the words.
            wiobs = tuple(w + "_" + iob for w, iob in zip(words, iobs))
            # A real list: zip() is a one-shot iterator on Python 3.
            sentence = list(zip(words, wiobs))
            tree = self.postChunker.parse(sentence)
            # Go from tree back to a list of tuples.
            loc_tags = tree2conlltags(flatten_deeptree(tree))
            return cleanIobs(words, post, loc_tags,
                             grammar_pattern_to_clean,
                             modified_chunk_pattern, clean_pattern)
        except Exception:
            # Best-effort by design: fall back to the unmodified input.
            return feature

コード例 #27

0

ファイルを表示

ファイル: shallow_parsing.py プロジェクト: 000Nelson000/text-analytics-with-python

# Everything from index 4000 onwards is held out as test data.
test_data = data[4000:]
print(train_data[7])

simple_sentence = 'the quick fox jumped over the lazy dog'

from nltk.chunk import RegexpParser
from pattern.en import tag

tagged_simple_sent = tag(simple_sentence)
print(tagged_simple_sent)

# Chunking: optional determiner, any adjectives, then one noun tag.
chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""
rc = RegexpParser(chunk_grammar)
c = rc.parse(tagged_simple_sent)
print(c)

# Chinking: chunk everything, then cut VBD / IN tags back out.
chink_grammar = """
NP: {<.*>+} # chunk everything as NP
}<VBD|IN>+{
"""
rc = RegexpParser(chink_grammar)
c = rc.parse(tagged_simple_sent)
print(c)

tagged_sentence = tag(sentence)
print(tagged_sentence)

grammar = """
NP: {<DT>?<JJ>?<NN.*>}

コード例 #28

0

ファイルを表示

ファイル: patternsManager.py プロジェクト: roneysco/TopX

def GetPatternsTree(tagsList, pattern, patternName):
    """Chunk ``tagsList`` with ``pattern`` and return what ``ExtractPhrases``
    produces for ``patternName`` on the resulting tree.
    """
    parsed = RegexpParser(pattern).parse(tagsList)
    return ExtractPhrases(parsed, patternName)

コード例 #29

0

ファイルを表示

ファイル: Parser.py プロジェクト: thetomcraig/EROS-old

	def chunk(self, posTaggedQuote):
		'''Condense a POS-tagged quote in three chunking passes.

		Pass 1 chunks emotion phrases (EMP) in ``posTaggedQuote``; passes 2
		and 3 chunk preposition phrases (PRPH) in the output of the
		previous pass. After each pass every chunk subtree is replaced by
		the placeholder ``('', <label>)`` while plain tuples are kept, the
		result is stored in ``self.quoteItemCondensedList`` and
		``self.printCondensed()`` is called.

		NOTE(review): pass 1 appends to whatever is already stored in
		``self.quoteItemCondensedList``. A local list of the same name used
		to be created here and never read, which suggests a reset of the
		attribute was intended -- confirm before changing.
		'''
		EMPChunker = RegexpParser(r"""
		EMP:                                                           #Emotion Phrase
			{<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><,><CC>}                  #Modular, verb, anything, adjective, comma, conjunction
			{<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><CC>}                     #Modular, verb, anything, adjective, conjunction
			{(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><,><CC>}                      #Verb, anything, adjective, comma, conjunction
		    {(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><CC>}                         #Verb, anything, adjective, conjunction

		    {(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><CC>}	                       #Verb, anything, adjective, conjunction

			{(<VBP>|<VB>|<VBZ>|<VBD>)<RB><JJ>}                         #Verb, adverb, adjective
		   	{<MD><RB>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                     #Modular, adverb, verb, adjective
			{<RB>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                         #Adverb, verb, adjective

			{<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                         #Modular, verb, anything, adjective
			{(<VBP>|<VB>|<VBZ>|<VBD>)<TO>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>} #Verb, "to", verb anything, adjective
			{(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>}                             #Verb, anything, adjective

		""")
		PRPHChunker = RegexpParser(r"""
		PRPH:                                                          #Preposition Phrase
			{<.*>*<PRP><.*>*<EMP>}				                       #Anything, proposition, anything
			{<EMP><.*>*<PRP><.*>*}				                       #Anything, proposition, anything
			}<EMP>{													   #Chink at the EMP chunk, recursion!
		""")

		def condense(chunker, tagged, label):
			# Parse ``tagged``, append the condensed pieces to the
			# instance list and print the intermediate result.
			for piece in chunker.parse(tagged):
				if isinstance(piece, tuple):
					self.quoteItemCondensedList.append(piece)
				else:
					self.quoteItemCondensedList.append(('', label))
			self.printCondensed()

		# Pass 1: the EMP chunk becomes the placeholder ('', 'EMP').
		condense(EMPChunker, posTaggedQuote, 'EMP')

		# Passes 2 and 3 simulate the recursion: the previous output is
		# chunked again and the instance list is rebuilt from scratch.
		for _ in range(2):
			pending = self.quoteItemCondensedList
			self.quoteItemCondensedList = []
			condense(PRPHChunker, pending, 'PRPH')

コード例 #30

0

ファイルを表示

ファイル: Information Extraction.py プロジェクト: NGrech/FYP

class Chunker:
    """Regular-expression chunker for date, phrase and clause structures.

    Wraps an NLTK ``RegexpParser`` built from a fixed multi-stage grammar and
    offers helpers to prepare tagged input and to reconcile clause labels
    with the entity tags carried by the leaves.
    """

    def __init__(self):
        """Build the parser and the set of tokens to drop before tagging."""
        # The grammar text is runtime data for RegexpParser; keep it verbatim.
        grammar = r'''
            R-DATE: {<IN><CD><TO><CD>}
            R-DATE: {<IN><CD><IN><CD>}
            R-DATE: {<JJ><CD><CC><CD>}
            FULL-DATE: {<IN><CD><NNP><CD>}
            FULL-DATE: <VB.*>{<CD><NNP><CD>}
            MONTH-DATE: {(<IN|DT>)?<NNP><CD>}
            NP: {<JJR><IN><CD><NNS>}
            NP: {<IN><CD><NNS>}
            NP: {<CD><IN><DT><CD><NNS>(<JJ>)?}
            DM_DATE: {<IN><CD><NNP>}(<,>|<NN.*>)
            DATE: {<IN>(<DT>)?<CD>}
            DT-DATE: {<DT><CD>}
            POS-DATE: <POS>{<CD>}
            V-DATE: {<IN|CD><JJ><CD>}
            DATE: (<,>)?{<CD>}<,>
            N-DATE: (<,>)?{((<.*DATE><,>)+)?<CD><CC><CD>}

            NN-LST: {<NN.*>(<,><NN.*>)+(<,>)?<CC><NN.*>}
            NP: {(<RP|IN|NN.*|.*DT|RB|JJ.*|RB.*|POS|``|"|''|FW|POS-DATE|CD|TO|WRB>)*<NN.*>(<TO>(<DT>)?<NN.*>)?(<RB>)?(<IN>)?(<JJ|RB|CD|DT|POS>)*}
            NP: {<P-DATE><NP>}
            NP: {<NP><NP>}
            NP: {<NP><,><NP><,>}
            CC-NP: {<NP>(<CC><NP>)+}

            PP: {((<PDT>)?<DT>)?(<RB|IN|WRB|WDT|TO|JJ|PRP>)*<PRP.*>(<MD>)?}
            PP: {<WP|WRB>}
            PP: {<IN><WDT>(<DT|RBR>)*}
            PP: <,>{<DT><JJ>}

            NP: {<NP><PP><NP>}
            P-NP: {<PP><NP>(<,><NP><,>)?}
            C-PP: {(<CD><PP>|<PP><CD>)}
            CC-P-NP: {<P-NP|PP><CC><NP>}
            NP: {<NP><,>((<,|CC>)*<.*NP>)*<,>}

            VP: {<VB.*><IN><TO><DT><VB.*>}
            VP: {<VB.*><RP>}
            VP: {(<IN|TO|VB.*|.*DT|RB|JJ|EX|MD>)*<VB.*>(<JJ>)?(<RB>(<TO|JJ|>)?)?}
            VP: {<IN><DT><VB.*>(<RB><TO>)?}
            VP: {<RB|VB.*|MD|TO>*<VB.*><RB|VB.*|MD|TO>*}
            VP: {<VP><IN>}
            VP: {<IN><VP>(<RP>)?<TO>}
            VP: {((<DT>)?<IN>)?<WDT><VP>}
            VP: {<IN><DT-DATE><VP>}
            Y-DATE: <JJ>{<CD>}
            VP: {<JJ>}<Y-DATE>
            CC-VP: {<VP><NP><CC><VP><NP>}

            CC-NP: <VP>{<NP>(<,><NP>)*<CC><NP>}
            D-NP : <VP>{<.*DATE><.*NP>}

            CLAUSE-P: <,|CC>{<VP><P-NP>}(<,>|<CC>|<.*DATE>)
            CLAUSE-NS: <,>(<CC>)?{(<VP><.*NP>)+}<,>
            CLAUSE-NS: <CC>{(<VP><.*NP>)+}
            CLAUSE: {<NP>(<VP><.*NP>|<CC-VP>)+(.*P-NP)?}
            CLAUSE-P: {<PP|P-NP>(<VP><.*NP>|<CC-VP>)+}
            CLAUSE-P: <,>{<PP|P-NP><VP>}<,>
            CLAUSE-P: <,>{<PP|P-NP><VP><CLAUSE>}
            CLAUSE: <CC>{<NP><VP><CLAUSE-P>}
            CLAUSE-NS: <,>{<VP><.*NP>}
            CLAUSE-OSL: <CLAUSE-P><CC><,>{<NP>}<,>
            CLAUSE-OSR: <,>{<NP>}<CLAUSE-P>
            CLAUSE: {<NP><CLAUSE-P>}

            D-CLAUSE-P: {<CLAUSE-P><.*DATE>}
            D-CLAUSE-P: <,>{<DATE><CLAUSE-P>}<,>
            D-CLAUSE-P: <,>{<CLAUSE-P><,><VP><.*DATE>}
            D-CLAUSE: {<CLAUSE><.*DATE>}
            D-CLAUSE: {<.*DATE><,><CLAUSE>}<,>
            CLAUSE-NS: {<VP><.*NP>}
            D-CLAUSE-NS: {<CLAUSE-NS><.*DATE>}
            D-CLAUSE-NS: {<VP><NP><.*DATE>}<,>
            D-CLAUSE-NS: <CC>{<.*DATE>(<,>)?<CLAUSE-NS>}
            D-CLAUSE-P: {<P-NP><VP><.*DATE>}


            D-CLAUSE-M-P: {<.*DATE><,><CLAUSE-P>((<,|CC>)+<CLAUSE-P>)+}
            D-CLAUSE-M: {<.*DATE><,><CLAUSE-P>(<,>(<CC>)?<CLAUSE-NS>)+}
            D-CC-CLAUSE: {<.*DATE><CLAUSE><,><CC><CLAUSE>}
            D-CLAUSE: {<.*NP><.*VP><.*DATE>}
            D-CLAUSE: <,>{<.*DATE><.*CLAUSE.*>}
            D-CLAUSE-P: {<CLAUSE-P>(<,>)?(<.*NP>)?<.*DATE>}
            D-CLAUSE-P-L: <D-CLAUSE-P>(<,|CC>)+{<NP>(<,><NP>)*<.*DATE>}
            D-CLAUSE-P: {<.*DATE><,><CLAUSE-P>}
            D-CLAUSE-NS: <.*CLAUSE.*>(<,|CC>)*{<.*DATE>(<,>)?<CLAUSE-NS>}
            DD-CLAUSE: {<D-CLAUSE.*>(<,|CC>)+(<RB>)?<.*DATE>}
            D-CLAUSE-P: {<.*DATE><CLAUSE-P>}(<,>)?
            D-CLAUSE-P: (<,>)?{<CLAUSE-P><CC><D-CLAUSE-NS>}
             '''
        self.chunker = RegexpParser(grammar, loop=1)

        # Every punctuation character is excluded except these four, which
        # the grammar above refers to or which should survive filtering.
        kept_punctuation = {';', ':', '&', ','}
        self.exclude = set(string.punctuation) - kept_punctuation
        # The two quote tokens are excluded as well.
        self.exclude.update(('``', "''"))

    def prepare_sentence(self, s: list) -> list:
        """Filter and POS-tag a sentence given as ``(word, entity)`` pairs.

        Items whose first element is in ``self.exclude`` are removed, the
        remaining words are tagged with ``nltk.pos_tag``, and the result is a
        list of ``(word, pos_tag, entity)`` triples in the original order.
        """
        kept = [item for item in s if item[0] not in self.exclude]
        words = [item[0] for item in kept]
        tagged = nltk.pos_tag(words)
        return [
            (word, pos, entity)
            for (word, pos), (_, entity) in zip(tagged, kept)
        ]

    @staticmethod
    def tree_label_fix(tree: nltk.tree.Tree) -> nltk.tree.Tree:
        """Make the ``D-`` marker of top-level clause subtrees match their leaves.

        Only direct children of ``tree`` whose label contains ``CLAUSE`` are
        inspected. A clause without a ``D-`` marker gains the ``D-`` prefix
        when any leaf has ``'DATE'`` as its third element; a clause that has
        the marker loses every ``D-`` occurrence when no leaf does. The tree
        is modified in place and also returned.
        """
        for st in tree:
            if not isinstance(st, nltk.tree.Tree):
                continue

            label = st.label()
            if re.match(r'.*CLAUSE.*', label) is None:
                continue

            # Third element of each leaf is the entity tag.
            has_date = 'DATE' in [leaf[2] for leaf in st.leaves()]

            if re.match('.*D-.*CLAUSE.*', label) is None:
                if has_date:
                    # Fixing the label of the tree
                    st.set_label('D-' + label)
            elif not has_date:
                st.set_label(re.sub(r'D-', '', label))
        return tree

    def generate_tree(self, s: list) -> nltk.tree.Tree:
        """Chunk a prepared sentence and return the label-corrected tree."""
        # noinspection PyTypeChecker
        parsed = self.chunker.parse(s)
        return self.tree_label_fix(parsed)

コード例 #31

0

ファイルを表示

from nltk import Tree, RegexpChunkParser
from nltk.chunk import RegexpParser
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule

# A single POS-tagged sentence used by both halves of this demo.
s = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'),
     ('chapters', 'NNS')]
# First half: build a parser from a grammar string with one chunk rule
# ({...}) and one chink rule (}...{), then parse the sentence with it.
chunker = RegexpParser(r'''
NP:
    {<DT><NN.*><.*>*<NN.*>}
    }<VB.*>{''')

print(chunker.parse(s))

# Second half: apply the same two rules by hand to a ChunkString and print
# the intermediate state after each step.
t = Tree('S', s)
cs = ChunkString(t)
print(cs)

# Same pattern as the chunk rule in the grammar above.
ur = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')
ur.apply(cs)
print(cs)

# Same pattern as the chink rule in the grammar above.
ir = ChinkRule('<VB.*>', 'chink verbs')
ir.apply(cs)
print(cs)

# Convert the ChunkString back to a chunk structure and print it.
print(cs.to_chunkstruct())
# cs.to_chunkstruct().draw()

# Rebinds `chunker` to a parser built from the two rule objects; it is not
# used within this excerpt.
chunker = RegexpChunkParser([ur, ir])

コード例 #32

0

ファイルを表示

# Regex-based shallow parser.
# NOTE(review): the original note says NLTK Tree structures are converted to
# ChunkString objects during parsing - that happens inside NLTK, not in this
# excerpt.
# Create a RegexpParser object from chunking and chinking rules (classes
# ChunkRule and ChinkRule); here the rules are given as a grammar string.

smple_sntnc = 'The brown fox is quick and he is jumpling over the lazy dog'

# Create POS tagged tokens from sample sentence
# NOTE(review): `tag` is not defined in this excerpt - confirm its origin.
tagged_sentence = tag(smple_sntnc)

print(tagged_sentence)

# Create the shallow parser: one chunk rule per phrase label.
grammar = """
NP: {<DT>?<JJ>?<NN.*>}
ADJP: {<JJ>}
ADVP: {<RB.*>}
PP: {<IN>}
VP: {<MD>?<VB.*>+}
"""

rc = RegexpParser(grammar)

# Shallow parse the sample sentence
c = rc.parse(tagged_sentence)
print(c)

# Evaluate parser performance on test data
# NOTE(review): `test_data` is not defined in this excerpt; also confirm that
# `evaluate` is still available in the installed NLTK version.
print(rc.evaluate(test_data))

コード例 #33

0

ファイルを表示

ファイル: chunks.py プロジェクト: anderscui/nlpy

from nltk import Tree, RegexpChunkParser
from nltk.chunk import RegexpParser
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule

# A single POS-tagged sentence used by both halves of this demo.
s = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')]
# First half: build a parser from a grammar string with one chunk rule
# ({...}) and one chink rule (}...{), then parse the sentence with it.
chunker = RegexpParser(r'''
NP:
    {<DT><NN.*><.*>*<NN.*>}
    }<VB.*>{'''
)

print(chunker.parse(s))

# Second half: apply the same two rules by hand to a ChunkString and print
# the intermediate state after each step.
t = Tree('S', s)
cs = ChunkString(t)
print(cs)

# Same pattern as the chunk rule in the grammar above.
ur = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')
ur.apply(cs)
print(cs)

# Same pattern as the chink rule in the grammar above.
ir = ChinkRule('<VB.*>', 'chink verbs')
ir.apply(cs)
print(cs)

# Convert the ChunkString back to a chunk structure and print it.
print(cs.to_chunkstruct())
# cs.to_chunkstruct().draw()

# Rebinds `chunker` to a parser built from the two rule objects; it is not
# used within this excerpt.
chunker = RegexpChunkParser([ur, ir])

コード例 #34

0

ファイルを表示

ファイル: 04_regex_grammars.py プロジェクト: ma0c/practical_nlp

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

# Download the 'averaged_perceptron_tagger' resource (presumably the model
# that pos_tag below relies on - confirm against the NLTK version in use).
nltk.download('averaged_perceptron_tagger')

# Split the text into sentences, then each sentence into word tokens.
phrase = "I love Ice Cream. I also like steak"
tokenized_sentences = sent_tokenize(phrase)
tokenized_phrases = [
    word_tokenize(sentence) for sentence in tokenized_sentences
]

# POS-tag every tokenized sentence; the comprehension variable reuses the
# name `phrase` but does not rebind the module-level string on Python 3.
tagged_words = [pos_tag(phrase) for phrase in tokenized_phrases]
print(tagged_words)

# Grammar with a single rule: every PRP, NN or NNP token becomes an NP chunk.
grammar = r"""
NP: {<PRP|NN|NNP>}
"""

parser = RegexpParser(grammar)

# Parse each tagged sentence, print all results, then call draw() on the
# first one.
results = [parser.parse(sentence) for sentence in tagged_words]
print(results)
results[0].draw()

コード例 #35

0

ファイルを表示

ファイル: nltk_hand)n.py プロジェクト: kingafy/My_codes_NLP

# Collect the name of every lemma of every synset of 'girl'.
# NOTE(review): `wn` is not defined in this excerpt - presumably the WordNet
# corpus reader; confirm against the imports earlier in the file.
synonyms = []
for syn in wn.synsets('girl'):
    print(syn)
    for lemma in syn.lemmas(): #  A lemma is basically the dictionary form or base form of a word, as opposed to the various inflected forms of a word. 
        print(lemma)
        synonyms.append(lemma.name())
# Bare expression: only displays a value in an interactive/notebook session.
synonyms


# For every lemma that has antonyms, keep the name of its first antonym.
antonyms = []
for syn in wn.synsets("girl"):
    for l in syn.lemmas():
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
antonyms


###chunking####
# NOTE(review): `tokens` is not defined in this excerpt - confirm its origin.
from nltk import pos_tag
tags = pos_tag(tokens)
tags

# Single-rule grammar: optional DT, any number of JJ, then one NN form an NP.
from nltk.chunk import RegexpParser
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunker = RegexpParser(grammar)
result = chunker.parse(tags)
result

# Repeats the three statements above with the same grammar and tags.
chunker = RegexpParser(grammar)
result = chunker.parse(tags)
result

コード例 #36

0

ファイルを表示

ファイル: review1.py プロジェクト: abijith-kp/nlp-sentiment-analysis

#help(t)
sentCount = 1
sentScore = []          #tuple with (Subj-Obj , Verb-P , )
totalS = []

print "Processing input..."
print "Number of sentences to process: ", len(arr_pos)

for q in ["", vp, prd, cls1, cls2]:
   grammer += q
   npc = RegexpParser(grammer)
   print "\n\n"
   for i in arr_pos:
	print "Reading sentence ", sentCount
	sentCount += 1
	t = npc.parse(i)
	print t
	tmpVP = []
	tmpNP = []
	tmpPrd = []
	tmpCls = []
	x1 = ""
   
	for x in t:
	    try:
		if x.node == "VP":
	       #print x
		    x1 = addVerbPhrase(x)
		    tmpVP.append(x1)
	       
		if x.node == "NP":

コード例 #37

0

ファイルを表示

ファイル: shallow_parsing.py プロジェクト: santosh500/APL_PythonDeepLearning

# Everything from index 4000 onwards is held out as test data.
# NOTE(review): `data`, `train_data` and `sentence` are defined outside this
# excerpt.
test_data = data[4000:]
# Single-argument print calls behave the same on Python 2 and Python 3,
# unlike the print statement, which is a syntax error on Python 3.
print(train_data[7])

simple_sentence = 'the quick fox jumped over the lazy dog'

from nltk.chunk import RegexpParser
from pattern.en import tag

# POS-tag the sample sentence.
tagged_simple_sent = tag(simple_sentence)
print(tagged_simple_sent)

# Chunking: optional DT, any number of JJ, then a NN* tag form an NP.
chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""
rc = RegexpParser(chunk_grammar)
c = rc.parse(tagged_simple_sent)
print(c)

# Chinking: chunk everything as NP first, then cut runs of VBD/IN back out.
chink_grammar = """
NP: {<.*>+} # chunk everything as NP
}<VBD|IN>+{
"""
rc = RegexpParser(chink_grammar)
c = rc.parse(tagged_simple_sent)
print(c)

tagged_sentence = tag(sentence)
print(tagged_sentence)

grammar = """
NP: {<DT>?<JJ>?<NN.*>}

コード例 #38

0

ファイルを表示

# Training data: the first 15000 tagged sentences of mac_morpho.
sentencas_treinadoras = mac_morpho.tagged_sents()[0:15000]
# Build the UnigramTagger on top of the default tagger (used as backoff) and
# train it with the tagged mac_morpho sentences.
etiq = UnigramTagger(sentencas_treinadoras, backoff=etiqPadrao)

coment = str(input("Entre com o texto: "))
if coment == "default":
        # Entering "default" loads the text from default.txt with newlines
        # replaced by spaces; the context manager closes the file handle,
        # which the previous open(...).read() chain left open.
        with open("default.txt", "r") as arquivo:
                coment = arquivo.read().replace("\n", " ")
# The text is lower-cased and converted into tokens.
tokens=nltk.word_tokenize(coment.lower())
# Every token of the text is tagged.
tags = etiq.tag(tokens)

# Create the regular-expression parser holding the patterns being searched.
analiseGramatical = RegexpParser(r"""
		PADRAO7: {<N><ADJ>}
        PADRAO1: {<ADJ><N>(<PREP>?<N>)*}
        PADRAO2: {<ADV><ADV>?<ADJ>(<N>(<PREP>?<N>)*)?}
        PADRAO3: {<N>(<PREP>?<N>)*(<ADJ>)<ADV><ADV>?}
        PADRAO4: {<N>(<PREP>?<N>)*<ADV>?<ADJ>+}
        PADRAO5: {<ADV><V>}
        PADRAO6: {<V><ADV>}
		""")
# The parser is then used to generate the pattern tree.
arvore = analiseGramatical.parse(tags)
# One ExtractPhrases result per pattern, ordered PADRAO1 .. PADRAO7.
x = [ExtractPhrases(arvore, "PADRAO1"), ExtractPhrases(arvore, "PADRAO2"),
     ExtractPhrases(arvore, "PADRAO3"), ExtractPhrases(arvore, "PADRAO4"),
     ExtractPhrases(arvore, "PADRAO5"), ExtractPhrases(arvore, "PADRAO6"),
     ExtractPhrases(arvore, "PADRAO7")]
# Print each result prefixed with its 1-based pattern number.
for aux, frases in enumerate(x):
        print("PADRAO 0"+str(aux+1)+str(frases))

コード例 #39

0

ファイルを表示

ファイル: plnTopX.py プロジェクト: serbarbosa/sentiment-pipeline

def GetPatternsTree(tagsList, pattern, patternName):
    """Parse tagged tokens with a chunk grammar and extract one pattern.

    A RegexpParser is built from ``pattern``, ``tagsList`` is parsed with it,
    and the resulting tree is handed to ExtractPhrases together with
    ``patternName``; whatever ExtractPhrases returns is returned unchanged.
    """
    parsed = RegexpParser(pattern).parse(tagsList)
    return ExtractPhrases(parsed, patternName)

コード例 #40

0

ファイルを表示

ファイル: nlpcourse1_5hours.py プロジェクト: 1UC1F3R616/myNLP

from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

# Tokenize the text and POS-tag the tokens.
# NOTE(review): `dataset` is not defined in this excerpt - confirm its origin.
tokenized_data = word_tokenize(dataset)
pos_tagging = pos_tag(tokenized_data)

# One "chunk" label with three chunk rules: runs of NNPS, runs of NNP, and
# single NN tokens.
chunk_sequence = """
chunk:
{<NNPS>+}
{<NNP>+}
{<NN>}"""

chunk = RegexpParser(chunk_sequence)

# Parse the tagged tokens and print the resulting structure.
chunked_data = chunk.parse(pos_tagging)
print(chunked_data)
"""## Named Entity Recognition
- Also known as
  - Entity Identification
  - Entity Chunking
  - Entity Extraction
- It is a subtask of information extraction that classifies named entities into pre-defined categories such as names of persons, organizations, locations
- Tesla: Organization, Elon Musk: Person

### Applications
- classify the contents to news providers
- Efficient search algorithms
- Content recommendation
- Question and Answer systems
- Automatic Forwarding

コード例 #41

0

ファイルを表示

ファイル: Text_Preprocessing.py プロジェクト: devthedevil/Machine-learning-Lab

    ''.join(c for c in s if c not in string.punctuation)
    for s in sentence_token
]
sentence_token = [s for s in sentence_token if s]
print(sentence_token)


#POS Tagging, Chunking and N-grams
def extract_ngrams(data, num):
    """Return the ``num``-grams of ``data`` as space-joined strings.

    ``data`` is split with word_tokenize, ``ngrams`` builds the token
    windows, and each window is joined with single spaces.
    """
    tokens = word_tokenize(data)
    return [' '.join(gram) for gram in ngrams(tokens, num)]


# The chunk grammar does not depend on the sentence, so the parser is built
# once instead of on every iteration.
chunker = RegexpParser(r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}""")

for t in sentence_token:
    # POS tagging: tokenize the sentence and tag each token.
    print(t)
    wordsList = word_tokenize(t)
    pos_tagged = pos_tag(wordsList)
    print("After POS-Tagging\n")
    print(pos_tagged)

    # Chunking: parse the tags computed for THIS sentence. The previous code
    # passed `tagged`, a name this loop never assigns.
    output = chunker.parse(pos_tagged)
    print("After chunking", '\n')
    print(output)

    # 3-grams of the raw sentence text.
    print("3 grams : ")
    print(extract_ngrams(t, 3))

コード例 #42

0

ファイルを表示

ファイル: ner.py プロジェクト: HarshitBangar/Feed-Fetcher

    # Filter out strings with an invalid tag
    taggedArticle = [sanitizeTags(unsanitizedList)
            for unsanitizedList in taggedArticleUnsanitized]

    # Chunk and calculate frequency
    frequency = {}
    paraNumber = -1
    for para in taggedArticle:
        paraNumber += 1

        if not len(para):
            # Ignore empty paragraphs
            continue

        # Extract all subtrees tagged with the right identifier
        for subtree in chunker.parse(para).subtrees(
                filter = lambda x: x.node == 'Nouns'):

            # Concatenate member strings
            leafString = ' '.join(
                    [key.lower() for key, value in subtree.leaves()])

            # Get the increment value
            increment = 1
            if paraNumber == 0:
                increment = 3       # Title
            elif paraNumber == 1:
                increment = 2       # First paragraph

            # Increment the frequency of the current string
            if leafString in frequency:
                frequency[leafString] += increment