def output():
    if request.method == 'POST':
        result = request.form['inputdata']
        result = result.lower()
        # Tokenise the paragraph for part-of-speech tagging
        words = word_tokenize(result)
        allwords = []
        for word in words:
            if word not in allwords:
                allwords.append(word)
        # Tag each unique word with its part of speech
        taggedwords = pos_tag(allwords)
        # Chunk everything into Junk, then chink out the favourable parts of
        # speech: nouns, adjectives, verbs and adverbs
        chunkString = """Junk*1234: {<.*>+}
                                }<NN*|JJ|VB.|RB>+{"""
        chunkParse = RegexpParser(chunkString)
        chunkedwords = chunkParse.parse(taggedwords)
        impwords = []
        # Convert the chunked tree to a list of node strings
        for words in chunkedwords:
            impwords.append(str(words))
        keyvalue=[]
        for word in impwords:
            if ('Junk*1234' not in word):
                keyvalue.append(word[1:].split(",")[0])
        keyvalue=[]
        for word in impwords:
            if ('Junk*1234' not in word):
                keyvalue.append(word[1:].split(",")[0])
        return render_template("output.html", keyvalues=keyvalue,result=result)
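The chink rule above works by first chunking every token into a single Junk chunk and then chinking the wanted tags back out, so the keywords end up outside any chunk. A minimal standalone sketch of the same idea on a hand-tagged sentence (my own toy data and a slightly broader tag pattern, so it runs without any NLTK corpora):

from nltk import RegexpParser

# Hand-tagged sentence: (word, Penn Treebank tag) pairs.
tagged = [("the", "DT"), ("quick", "JJ"), ("fox", "NN"), ("quickly", "RB"),
          ("jumps", "VBZ"), ("over", "IN"), ("dogs", "NNS")]

# Chunk everything into Junk, then chink out nouns, adjectives, verbs and adverbs.
grammar = r"""
Junk: {<.*>+}
      }<NN.*|JJ.*|VB.*|RB.*>+{
"""
tree = RegexpParser(grammar).parse(tagged)

# Tokens left outside the Junk chunks are the keyword candidates.
keywords = [node[0] for node in tree if isinstance(node, tuple)]
print(keywords)  # ['quick', 'fox', 'quickly', 'jumps', 'dogs']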
Example 2
def chunkingTweet(filtrd_tweet):
    # REPARSE and TREE are presumably aliases for nltk.RegexpParser and nltk.tree
    trees = []
    grammar = """
        ADJN: {<JJ><N.*>*}
        VBN:  {(<VB>|<VBG>|<VBD>|<VBN>|<VBP>|<VBZ>)(<NN>|<NNP>)}
        AVBN: {(<RB>|<RBR>|<RBS>)(<NN>|<NNP>)}
        MDVB: {<MD><.+>(<VB>|<VBG>|<VBD>|<VBN>|<VBP>)}
    """
    chunkParser = REPARSE(grammar)
    rootTree = chunkParser.parse(filtrd_tweet)
    rootTree.draw()
    for tree in rootTree:
        if isinstance(tree, TREE.Tree):
            trees.append(tree)
    return trees
Example 3
 def extraxt_semantic_chuncks(self, pos_tags):
     """ Extract chunks of text from the paper taking advantage of the parts of speech previously extracted.
     It uses a grammar
     Returns:
         chunks (list): list of all chunks of text 
     """
     grammar_parser = RegexpParser(GRAMMAR)
     chunks = list()
     pos_tags_with_grammar = grammar_parser.parse(pos_tags)
     #print(pos_tags_with_grammar)
     for node in pos_tags_with_grammar:
         if isinstance(node, tree.Tree) and node.label(
         ) == 'DBW_CONCEPT':  # if matches our grammar
             chunk = ''
             for leaf in node.leaves():
                 concept_chunk = leaf[0]
                 concept_chunk = re.sub(
                     '[\=\,\…\’\'\+\-\–\“\”\"\/\‘\[\]\®\™\%]', ' ',
                     concept_chunk)
                 concept_chunk = re.sub('\.$|^\.', '', concept_chunk)
                 concept_chunk = concept_chunk.lower().strip()
                 chunk += ' ' + concept_chunk
             chunk = re.sub('\.+', '.', chunk)
             chunk = re.sub('\s+', ' ', chunk)
             chunks.append(chunk)
     return chunks
Example 4
    def extract_entities(self, text, grammar=None, lang=None):
        """
        Extract entities from text
        """
        entities = []
        if lang is None:
            lang = WORLD_2_NLTK[self.detect_language(text)]
        else:
            if lang in WORLD_2_NLTK.keys():
                lang = WORLD_2_NLTK[lang]
            else:
                lang = self._lang
        if lang == 'japanese':
            return JAParser().extract_entities(text), lang
        pos_sentences = [
            pos_tag(self.word_tokenize(sentence, lang=lang))
            for sentence in self.sent_tokenize(text, lang=lang)
        ]

        if grammar is not None:
            chunker = RegexpParser(grammar)
            for pos_sentence in pos_sentences:
                tree = chunker.parse(pos_sentence)
                self.logger.debug(tree)
                entities = entities + self._select_entities(tree)
        else:
            for pos_sentence in pos_sentences:
                tree = ne_chunk(pos_sentence, binary=False)
                self.logger.debug(tree)
                entities = entities + self._select_entities(tree)
        return entities, lang
Example 5
def findNounPhrases(toRead, category):
    words = tagWords(toRead)

    #create file
    newFile = open(category.strip() + 'NounPhrases.txt', 'a')

    # ? means the tag is optional (include it if present)
    # * means zero or more occurrences
    # + means one or more occurrences
    patterns = r"""
 	NP:
     	{<VBD><N.*>+}
     	{<VBG><N.*>+}
     	{<VBN><N.*>+}


	"""
    chunker = RegexpParser(patterns)
    tree = chunker.parse(words)

    #subtree.leaves() returns list
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        newFile.write("\n")
        newFile.write("\n")
        for leaf in subtree.leaves():
            newFile.write(str(leaf))

    newFile.close()
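As a quick illustration of the quantifiers described in the comments above, here is the classic optional-determiner noun-phrase rule applied to a hand-tagged sentence (toy data, not from this project):

from nltk import RegexpParser

# ? = optional, * = zero or more, + = one or more (each applies to the preceding <...> pattern)
grammar = r"""
NP: {<DT>?<JJ>*<NN.*>+}
"""
tagged = [("the", "DT"), ("old", "JJ"), ("wooden", "JJ"), ("house", "NN"),
          ("collapsed", "VBD"), ("yesterday", "NN")]

tree = RegexpParser(grammar).parse(tagged)
for subtree in tree.subtrees(filter=lambda t: t.label() == "NP"):
    print(subtree.leaves())
# [('the', 'DT'), ('old', 'JJ'), ('wooden', 'JJ'), ('house', 'NN')]
# [('yesterday', 'NN')]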
def extract_NP(posTagged):
    grammar = r"""

        ADJ:
            {<RB.*>? <JJ.* | VBG>}

        ADJLIST:
            {<ADJ> (<CC>? <,>? <ADJ>)*}

        ADJNOUN:
            {<ADJLIST>? <NN.*>+}

        PREFIXEDNOUN:
            {<DT|PRP\$>? (<ADJNOUN> <POS>)* <ADJNOUN>}

        PP:
            {<IN><PREFIXEDNOUN>}

        NP:
            {<PREFIXEDNOUN> (<PP>)*}
            {<PRP>}

        """
    chunker = RegexpParser(grammar)
    ne = []
    chunk = chunker.parse(posTagged)
    for tree in chunk.subtrees(filter=lambda t: t.label() == 'NP'):
        ne.append(' '.join([child[0] for child in tree.leaves()]))
    return ne
Example 7
def GetNounPhrase(sentence):

    #print('GetNounPhrase is called')
    output = ''

    #Match an optional determiner, any adjectives, and one or more nouns
    grammar = 'NP: {<DT>?<JJ>*<NN.*>+}'

    #Create the Parser Object
    cp = RegexpParser(grammar)

    #Tokenize the input and get part of speech
    pos = pos_tag(word_tokenize(sentence))

    result = cp.parse(pos)

    #for debugging
    #result.draw()
    #print(result)

    #Loop through the tree datastructure and pull the values under DNP node
    #we created for the result
    for subtree in result.subtrees(filter=lambda t: t.label() == 'NP'):
        output = ' '.join(item[0] for item in subtree.leaves())

    return output
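For reference, the same NP grammar applied directly to a pre-tagged sentence shows what GetNounPhrase matches (hand-written tags here, so the sketch runs without the tokenizer and tagger data that word_tokenize and pos_tag need). Note that GetNounPhrase overwrites `output` on every match, so it effectively returns only the last noun phrase found.

from nltk import RegexpParser

cp = RegexpParser("NP: {<DT>?<JJ>*<NN.*>+}")

# Roughly what pos_tag(word_tokenize("the red car hit a tall tree")) would produce.
pos = [("the", "DT"), ("red", "JJ"), ("car", "NN"),
       ("hit", "VBD"), ("a", "DT"), ("tall", "JJ"), ("tree", "NN")]

for subtree in cp.parse(pos).subtrees(filter=lambda t: t.label() == "NP"):
    print(" ".join(word for word, tag in subtree.leaves()))
# the red car
# a tall tree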
def convert_to_noun(sen):
    sen = ie_preprocess(sen)
    grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
        {<NNP>+}                # chunk sequences of proper nouns
    """
    cp = RegexpParser(grammar)
    res = cp.parse(sen[0])
    print(res)
    ROOT = 'ROOT'
    tree = res
    output = []

    def getNodes(parent):
        for node in parent:
            if type(node) is Tree:
                print("Label:", node.label())
                print("Leaves:", node.leaves())
                if node.leaves()[0][1] in ("NN", "JJ"):
                    if node.leaves()[0][0] not in output:
                        output.append(node.leaves()[0][0])
                    print(node.leaves()[0][0])

                getNodes(node)
            else:
                print("Word:", node)
                if node[1] in ("NN", "JJ"):
                    if node[0] not in output:
                        output.append(node[0])

    getNodes(tree)
    print(output)
    return " ".join(output)
 def identify_map_verbs_with_negation(nlp, sentences):
     """Errors are happens when sentences are list type, hint: extracts string from the list"""
     sentences = sent_tokenize(sentences)
     grammar = r"""
               GR : {<RB>*<VB|VBN|JJ|VBG|VBZ|VBP|VBD>+<IN|RP>*} 
               """
     # GR : {<RB>*<VB|VBN|JJ|VBG|VBZ|VBP>+<IN|RP>*}
     map_part_verb_and_negation = []
     for sent in sentences:
         # sample = sent.split('=')
         words = word_tokenize(sent)
         tagged = pos_tag(words)
         cp = RegexpParser(grammar)
         t = cp.parse(tagged)
         # t.draw()
         negate = ''
         verb = ''
         verbs = []
         for s in t.subtrees():
             is_phrasal = False
             if s.label() == "GR":
                 for token in s.leaves():
                     if token[0] == 'is' or token[0] == 'are' or token[
                             0] == 'does' or token[0] == 'do':
                         continue
                     elif token[1] == 'RB':
                         negate = token[0]
                     elif token[0] != "=":
                         verb = verb + " " + token[0]
         verb = InputsOutputsStateFinder.phrasal_verb_verifier_or_verb_part_extractor(
             nlp, verb)
         verbs.append([negate, verb])
         map_part_verb_and_negation.append(verbs)
     return map_part_verb_and_negation.pop()
Example 10
 def drawNamedEntityTree(self, text):
     tokenized_text = tokenizer.tokenize(text)
     tagged_text = self.tagWords(tokenized_text)
     grammar = "ENT: {<PESSOA>*}"
     cp = RegexpParser(grammar)
     res = cp.parse(tagged_text)
     res.draw()
Example 11
def tokenise_subjects(subject):
    if subject == '' or subject is None:
        return []
    split_subjects = []
    phrase_pattern = 'CHUNK:{<JJ>*<NN.?>*<VBG>*}'
    phrase_chunker = RegexpParser(phrase_pattern)
    for s in subject.split(','):
        tokens = word_tokenize(s.strip().lower())
        tags = pos_tag(tokens)
        phrases = [
            ' '.join([leaf[0] for leaf in c.leaves()])
            for c in phrase_chunker.parse(tags)
            if hasattr(c, 'label') and c.label() == 'CHUNK'
        ]
        for phrase in phrases:
            phrase_tokens = word_tokenize(phrase)
            phrase_tags = pos_tag(phrase_tokens)
            lemmatised_phrase = []
            for pto, pta in phrase_tags:
                wn_tag = {
                    'n': wn.NOUN,
                    'j': wn.ADJ,
                    'v': wn.VERB,
                    'r': wn.ADV
                }.get(pta[0].lower(), None)
                if wn_tag is None:
                    continue
                lemmatised = WordNetLemmatizer().lemmatize(pto, wn_tag)
                lemmatised_phrase.append(lemmatised)
            if len(lemmatised_phrase) > 0:
                lemmatised_phrase = ' '.join(lemmatised_phrase)
                split_subjects.append(lemmatised_phrase)
    return list(set(split_subjects))
def pos_tagging(result):
    pos_tagged_words = []
    for tf_idf_info in result:
        tf_idf_info["pos"] = morph.parse(tf_idf_info["normal_form"])[0].tag.POS
        if tf_idf_info["pos"] is not None:
            pos_tagged_words.append(
                (tf_idf_info["normal_form"], tf_idf_info["pos"]))

    # ToDo: Add reg exps for numeric
    patterns = """
                    many adj+noun:{<ADJF>+<NOUN>}
                    noun+many adj:{<NOUN><ADJF>+}
                    verb + noun:{<INFN><NOUN>+}
                    verb + verb:{<INFN><INFN>}
                    prep + verb/noun:{<PRCL>(<INFN>|<NOUN>)} 
                    verb + prep + verb?:{<INFN><PRCL><INFN>?}
                    conj + verb/verb + conj:{(<INFN><CONJ>)|(<CONJ><INFN>)?}
               """

    chunker = RegexpParser(patterns)
    tree = chunker.parse(pos_tagged_words)

    for subtree in tree.subtrees():
        if subtree._label == "S":
            continue

        # highlight all words in collocation if one of them already was highlighted
        # TODO: Iterate through all elements of subtree (it might be > 2)
        term1, term2 = subtree[0][0], subtree[1][0]
        tf_idf_info1, tf_idf_info2 = next(x for x in result if x["normal_form"] == term1), \
                                     next(x for x in result if x["normal_form"] == term2)

        if tf_idf_info1["highlight"] or tf_idf_info2["highlight"]:
            tf_idf_info1["highlight"], tf_idf_info2["highlight"] = True, True
Example 13
def nounphrase2(abstract):
    ab = abstract.lower()
    ab_words = [word_tokenize(s) for s in sent_tokenize(ab)]
    pos_tagged = []
    for sent in ab_words:
        pos_tagged.append(pos_tag(sent))
    chunk_grammar = "NP: {<VB.*><DT>?<JJ>*<NN><RB.?>?}"
    #chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
    #chunk_grammar = "NP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"
    chunk_parser = RegexpParser(chunk_grammar)
    np_chunked = []
    for sentence in pos_tagged:
        np_chunked.append(chunk_parser.parse(sentence))
    most_com = np_chunk_counter(np_chunked)

    truelab = lambda row: row.true_label4 if row.Abstract == abstract else None
    p = df.apply(truelab, axis=1)
    if p.dropna().values[0] == 1.0:
        print('\n')
        print(p.dropna())
        print(most_com)
        print('\n')
    count = 0
    # print(most_com)
    #print('\n')
    # print("count2",count)
    if count >= 2:
        return 1
    elif count == 1:
        return 0
    elif count <= 0:
        return 2
Example 14
def GetName(sentence):
    
    #Parse either Proper Noun Singular or Noun because RegexpParser is inaccurate at times
    grammar = 'NAME: {<NNP>*|<NN?>*}'
    
    #Create the Parser Object
    cp = RegexpParser(grammar)
    
    common_words = {'hi', 'name', 'hello', 'thank', 'you', 'i', 'am', 'oh', 'hey', 'sure', 'yes', 'named', 'known'}
    
    #Tokenize the input
    word_tokens = word_tokenize(sentence)
    
    #Eliminate the greeting words and get straight to discerning the Name as NNP or NN
    word_tokens = [x for x in word_tokens if x.lower() not in common_words]
    
    #Obtain parts of speech for each token and run through parser
    pos = pos_tag(word_tokens)
    result = cp.parse(pos)
    
    #print statements for debugging 
    #print(result)
    #result.draw()
    
    #Loop through the tree datastructure and pull the x (actual name), if the Root is 'NAME'
    #we created for the result
    
    output = "" 
    for tree in result.subtrees():
        if tree.label() == 'NAME':
            name_match = ' '.join([x for x,y in tree.leaves()])
            output = output + ' ' + name_match
    
    return output.replace("  ", " ").strip()
Example 15
class Chunking:
    def __init__(self, formats: list):
        self.__text = "\n".join(formats)
        self.__parse: RegexpParser = RegexpParser(self.__text)

    def setParser(self, formats):
        if type(formats) is list:
            self.__text = "\n".join(formats)
        else:
            self.__text = formats
        self.__parse = RegexpParser(self.__text)

    def getParse(self) -> RegexpParser:
        return self.__parse

    def addChunking(self, s: str):
        self.__text += "\n".join(s)
        self.__parse: RegexpParser = RegexpParser(self.__text)

    def parse(self, tokens) -> Tree:
        return self.__parse.parse(tokens)

    def merge(self, tokens):
        chunks = self.__parse.parse(tokens)
        ret = []
        for N in chunks:
            if type(N) is Tree:
                label = N.label()
                text = " ".join([T[0] for T in N.leaves()])
                ret.append((text, label))
            else:
                ret.append(N)
        return ret
Example 16
def post_text_process(text):
    """
      Performs a post process with the text, preparing it for text deep analysis
      or other purposes.
    """
    # Splits words to tokens
    tokenized_word = word_tokenize(text.lower())

    # Normalize words to normal form
    # aka: playing -> play
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    for word in tokenized_word:
        lemmatized_words.append(lemmatizer.lemmatize(word))

    # Remove stopwords from the set
    stopwords_set = set(stopwords.words('english'))
    stopwords_set.add('.')
    stopwords_text = [
        word for word in lemmatized_words if word not in stopwords_set
    ]

    # Named entity recognition
    tags = nltk.pos_tag(stopwords_text)
    chunk = ne_chunk(tags)

    chunking_rule = "NP: {<DT>?<JJ>*<NN>}"
    chunking_text_parsed = RegexpParser(chunking_rule)
    chunking_result = chunking_text_parsed.parse(tags)

    # Returns important information created by the post processing
    return chunking_result, chunk
Example 17
 def on_get(self, req, resp, id):
     print(id)
     if (len(id) > 0):
         arts_obj = ArticleModel.objects(_id=ObjectId(id))
         art = arts_obj[0]
         print(art['href'])
         title = art['title']
         toks = word_tokenize(title)
         sent = sent_tokenize(art['txt'])
         sent = [word_tokenize(xt) for xt in sent]
         sent = [pos_tag(xt) for xt in sent]
         print(sent)
         tag = pos_tag(toks)
         grammar = "NP: {<DT>?<JJ>*<NN>}"
         patterns = """mychunk:{<NN.?>*<VBD.?>*<JJ.?>*<CC>?}"""
         cp = RegexpParser(patterns)
         rslt = cp.parse(tag)
         print(rslt)
         resp.json = {'rslt': str(rslt)}
     else:
         #resp.status = falcon.HTTP_200
         #arts = []
         #arts_obj = ArticleModel.objects().all_fields()
         #for art in arts_obj:
         #print(art.to_json())
         #    arts.append(art.to_json())
         callnames = ['tst']
         resp.json = {'rslt': json.dumps(callnames)}
Example 18
def process_command(command):
    try:
        words = word_tokenize(command)
        tagged = pos_tag(words)
        chunkGram = r"""
					Tasks: {<VB.?>}
					Numbers:{<CD>}
					"""
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)

        traverse(chunked)
        #chunked.draw()

        if (len(numbers) > 2):
            speak(to_many_numbers)
            return
        elif (len(numbers) < 2):
            speak(to_less_numbers)
            return
        if (possible_tasks.isdisjoint(tasks)):
            speak(unknown_task)
            return

        if ('add' in tasks):
            add(numbers[0], numbers[1])
        elif ('subtract' in tasks):
            sub(numbers[0], numbers[1])
        elif ('multiply' in tasks):
            mul(numbers[0], numbers[1])
        elif ('divide' in tasks):
            div(numbers[0], numbers[1])

    except Exception as e:
        print(str(e))
Example 19
    def __init__(self, grammar_filename: Optional[str] = None) -> None:
        """Helper class to chunk part-of-speech tagged text using grammar regular expressions.

        The default grammar regular expressions file is defined by the `FIlE_GRAMMAR` variable in
        `src.make_feedback_tool_data.text_chunking`.
        >>> from src.make_feedback_tool_data.text_chunking import FIlE_GRAMMAR
        >>> FIlE_GRAMMAR
        '.../govuk-corona-analysis/src/make_feedback_tool_data/grammar.txt'

        :param grammar_filename: Default: None. A path string to file containing regular expression grammar patterns
            usable by the `grammar` argument of the nltk.chunk.regexp.RegexpParser class. For each grammar type,
            each pattern should be listed on a separate line, and in descending order of priority (highest first). If
            None, it will use the default regular expression file.

        """
        self.logger = logging.getLogger(__name__)

        # If `grammar_filename` is None, use the default file path
        if not grammar_filename:
            grammar_filename = FIlE_GRAMMAR

        # Load the regular expressions from `grammar_filename`
        self.grammar = self._load_grammar_from_file(grammar_filename)

        # Initialise a nltk.RegexpParser object using `self.grammar`
        self.logger.info("Initializing parser...")
        self.parser = RegexpParser(self.grammar)
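A minimal sketch of what such a grammar patterns file could contain and how the resulting parser behaves; the rule names and example contents below are illustrative assumptions, not the actual grammar.txt from the repository:

from nltk import RegexpParser

# Hypothetical file contents: one grammar type per clause, one pattern per line,
# listed in descending order of priority, as the docstring above describes.
grammar_text = r"""
NP: {<DT>?<JJ.*>*<NN.*>+}   # noun phrase
VP: {<MD>?<VB.*>+<RB.*>?}   # verb group
"""

parser = RegexpParser(grammar_text)
tagged = [("I", "PRP"), ("cannot", "MD"), ("load", "VB"),
          ("the", "DT"), ("new", "JJ"), ("page", "NN")]
print(parser.parse(tagged))
# (S I/PRP (VP cannot/MD load/VB) (NP the/DT new/JJ page/NN))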
Example 20
def get_noun_counter(text) -> collections.Counter: 

    text = text.split()
    tokens_tag = pos_tag(text)
    patterns= """mychunk:{<JJ.?>*<NN.?.?>*}"""
    chunker = RegexpParser(patterns)
    output = chunker.parse(tokens_tag)

    noun_list = []
    compound_noun_list = []
    for n in output:
        if isinstance(n, nltk.tree.Tree):
            n = str(n)
            part_of_speech = [el.split('/')[1]for el in n.split()[1:]]
            if any([el.find('NN')>-1 for el in part_of_speech]):
                noun = [
                        stemmer.stem(el.split('/')[0])
                        if el.split('/')[1] == 'NNS' or el.split('/')[1] == 'NNPS' 
                        else el.split('/')[0] 
                        for el in n.split()[1:]
                    ]
                compound_noun_list.append(' '.join(noun))
                noun_list.extend(noun)

    noun_list = [ noun for noun in noun_list if len(noun) > 1]

    return collections.Counter(noun_list), compound_noun_list
Example 21
def data_gathering_iterator(file_path, morph, grammar=COMPLEX_GRAMMAR):
    """На каждой итерации возвращает список полученных из одной строки комбинаций прилаг + сущ.
    Элемент списка кортеж (прилагательное, существительно).
    Прилагательное и существительное приведены к нормальной форме

    :param file_path: путь до файла с данными. Файл должен быть в формате UTF-8
    :param morph: морфология
    """
    chunk_parser = RegexpParser(grammar)
    f = open(file_path, "r")

    for line in f:
        try:
            line = line.decode('utf-8')
        except UnicodeDecodeError:
            continue
        line = line.strip()
        # split into sentences
        for sentence in sent_tokenize(line):
            sentence = sentence.strip()
            if sentence:
                tokens = word_tokenize(sentence)

                tagged_tokens = pos_tag(tokens)
                tree = chunk_parser.parse(tagged_tokens)
                for subtree in tree.subtrees():
                    if subtree.node == u"CHUNK":
                        adj_noun_list = get_adj_noun_list_from_chunk(subtree)
                        yield normilize_adj_noun_list(adj_noun_list, morph)
def run(posTaggedTokenListList, pos1, pos2):
    retVal = ''
    total = 0

    # Define the regex for the word pair.
    regex = ('pattern: {<%s><%s>}' %
             (getPosTagRegex(pos1), getPosTagRegex(pos2)))

    # Create the parser.
    parser = RegexpParser(regex)

    # Parse each POS-tagged token list.
    for posTaggedTokenList in posTaggedTokenListList:
        parsedTree = parser.parse(posTaggedTokenList)

        for subtree in parsedTree:
            if isinstance(subtree, tree.Tree):
                retVal += (subtree[0][0] + " ")
                retVal += (subtree[1][0] + "\r\n")
                total += 1

    retVal = (("total: %d\r\npos1: %s, pos2: %s\r\n\r\n" %
               (total, pos1, pos2)) + retVal)

    return retVal
Example 23
def GetVerbDetNounPhrase(sentence):

    #print('GetNounPhrase is called')
    output = ''

    #Match a base-form or present-tense verb, an optional determiner, and a noun
    grammar = 'DNP: {<VB|VBP><DT>?<NN>}'

    #Create the Parser Object
    cp = RegexpParser(grammar)

    #Tokenize the input and get part of speech
    pos = pos_tag(word_tokenize(sentence))

    result = cp.parse(pos)

    #result.draw()
    #print(result)

    #Loop through the tree datastructure and pull the values under DNP node
    #we created for the result
    for tree in result.subtrees():
        if tree.label() == 'DNP':
            name_match = ' '.join([x for x, y in tree.leaves()])
            output = output + name_match

    return output
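A quick check of the corrected DNP pattern on a hand-tagged clause (toy data, not from the original project):

from nltk import RegexpParser

cp = RegexpParser("DNP: {<VB|VBP><DT>?<NN>}")
pos = [("please", "UH"), ("open", "VB"), ("the", "DT"), ("door", "NN")]

for subtree in cp.parse(pos).subtrees(filter=lambda t: t.label() == "DNP"):
    print(" ".join(word for word, tag in subtree.leaves()))
# open the door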
Example 24
	def recuperarEntidadesEs(self,texto):
		chunker = RegexpParser("""
		ENTI:
		    {<NNP|NNPS>+<NNP|NNPS|NN|NNS>} 
		    {<NN|NNS>+<NN|NNS><JJ>} 
		    {<NNP|NNPS><IN|DT><NNP|NNPS|NN|NNS>}
		    {<NN|NNS><JJ>|<JJ><NN|NNS>}
		    {<NNP|NNPS>}
		ENTIDACOMP:
			{<NN|NNS><ENTI>}
			{<NN|NNS><IN><ENTI>}
			{<ENTI>(<IN>|<IN><DT>)<ENTI|NN|NNS>}
			{<ENTI|ENTIDACOMP><JJ><IN><ENTI|ENTIDACOMP>}
		    {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
		    {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
		ENTIDACOMP2:
			{<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
		FECHA:
			{<LS|CD><IN><ENTI><DT><LS|CD>}
			{<LS|CD><IN><ENTI>}
			{<ENTI><DT><LS|CD>}
			{<ENTI><LS|CD>}
		""")
		'''chunker = RegexpParser("""
		ENTI:
		    {<NNP|NNPS>+<NNP|NNPS|NN|NNS>} 
		    {<NN|NNS>+<NN|NNS><JJ>} 
		    {<NNP|NNPS><IN|DT><NNP|NNPS|NN|NNS>}
		    {<NN|NNS><JJ>|<JJ><NN|NNS>}
		    {<NNP|NNPS>}
		ENTIDACOMP:
			{<DT><NN|NNS><ENTI>}
			{<DT><NN|NNS><IN><ENTI>}
			{<ENTI>(<IN>|<IN><DT>)<ENTI|NN|NNS>}
			{<ENTI|ENTIDACOMP><JJ><IN><ENTI|ENTIDACOMP>}
		    {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
		    {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
		ENTIDACOMP2:
			{<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
		FECHA:
			{<LS|CD><IN><ENTI><DT><LS|CD>}
			{<LS|CD><IN><ENTI>}
			{<ENTI><DT><LS|CD>}
			{<ENTI><LS|CD>}
		""")'''
		ObjTag = Tokenizar()
		Lista = []
		Lista2 = []
		for sentence in sent_tokenize(texto):
			tags=ObjTag.tagear(sentence)

			tagsentX=word_tokenize(sentence)
			filtered_words = ' '.join(w for w in tagsentX if not w in nltk.corpus.stopwords.words('spanish'))
			parsed = chunker.parse(tags)
			for chunk in parsed:
				if hasattr(chunk, 'node'):
					Lista2.append([chunk.leaves(),filtered_words])
					Lista.append (' '.join(c[0] for c in chunk.leaves()))
		return Lista2
def add_chunk_data(pos_data):
    chunker = RegexpParser(CHUNKER_GRAMMAR)
    chunks = {} 
    for sentence_id, data in pos_data.iteritems():
        result = chunker.parse(data)
        chunks[sentence_id] = {}
        chunks[sentence_id]['chunks'] = [' '.join([token for token, pos in t.leaves()]) for t in result.subtrees(lambda result: result.label() == 'SN')]
    return chunks
 def get_continuous_chunks(tokenized_text):
     # this regex is not working, change to another later
     NP = "(?:(?:\w+ ART)?(?:\w+ ADJ) *)?\w + (?:N[NP] | PRN)"
     chunker = RegexpParser(NP)
     tagged_text = PortugueseTextualProcessing.postag(tokenized_text,
                                                      as_list=False)
     chunked = chunker.parse(tagged_text)
     return PortugueseTextualProcessing().extract_chunks(chunked)
Example 27
def VBD_question(tagged):
    """
    Tries to leverage prepositions to generate questions.
    """
    try:
        first_verb_index = next(i for i, pair in enumerate(tagged) if (pair[1] == 'VBZ' or pair[1] == 'VBP'))
        subject_phrase = [pair[0] for pair in tagged[:first_verb_index+1]]
        phrase_dict = {'VBDP': 'VBDP: {<VBD|VBP|VBN><RB|JJ>*<IN>}'}
        vbdp_fragments = []
        for i, (key, phrase) in enumerate(phrase_dict.items()):
            cp = RegexpParser(phrase)
            if i==0:
                result = cp.parse(tagged)
            else:
                result = cp.parse(result)

        for i, item in enumerate(result):
            if type(item) is nltk.Tree:
                fragment = [pair[0] for pair in item]
                if item.node == 'VBDP':
                    vbdp_fragments.append((fragment, i))

        qa_list = []
        for vbdp, index in vbdp_fragments:
            question_list = subject_phrase + vbdp
            question_list.append('what?')
            question_string = ''.join([('' if c in string.punctuation else ' ')+c for c in question_list]).strip()
            sentence_remainder = result[index+1:]
            sentence_remainder_treeless = []
            for tree_or_tuple in sentence_remainder:
                try: 
                    tree_or_tuple.leaves()
                    for leaf in tree_or_tuple.leaves():
                        sentence_remainder_treeless.append(leaf)
                except AttributeError:
                    sentence_remainder_treeless.append(tree_or_tuple)


            answer_list = [pair[0] for pair in sentence_remainder_treeless]
            answer_string = ''.join([('' if c in string.punctuation else ' ')+c for c in answer_list]).strip()

            qa_list.append((question_string, answer_string))

        return qa_list
    except:
        """
        If no verb is recognized above, simply split the sentence based on prepositions.
        """
        prep_indices = [i for i, pair in enumerate(tagged) if pair[1] == 'IN']
        qa_list = []
        for prep_index in prep_indices:
            question_list = [pair[0] for pair in tagged[:prep_index+1]]
            question_list.append('what?')
            question_string = ''.join([('' if c in string.punctuation else ' ')+c for c in question_list]).strip()
            answer_list = [pair[0] for pair in tagged[prep_index+1:]]
            answer_string = ''.join([('' if c in string.punctuation else ' ')+c for c in answer_list]).strip()
            qa_list.append((question_string, answer_string))
        return qa_list
Example 28
def tagtosem(sent):
    cp = RegexpParser('''
        NP: {<DET>? (<ADJ>|<ADV>)* <CONJ>* (<NOUN>|<NUM>|<X>|(<PRON> <PRT>))* <PRON>?}
        R:  {(<PRT> <VERB>?)* <A..>* <PRON>?}
        V:  {<VERB>*(<PRT>*|<VERB>)*}
        PNC:{<\.>}
        C:  {<ADP>}
        ''')
    return cp.parse(sent)
Example 29
def extract_bow_from_raw_text(text_as_string):
    """Extracts bag-of-words from a raw text string.

    Parameters
    ----------
    text (str): a text document given as a string

    Returns
    -------
    list : the list of the tokens extracted and filtered from the text
    """
    if (text_as_string == None):
        return []

    if (len(text_as_string) < 1):
        return []

    import nltk
    if '/home/hadoop/nltk_data' not in nltk.data.path:
        nltk.data.path.append('/home/hadoop/nltk_data')

    nfkd_form = unicodedata.normalize('NFKD', unicode(text_as_string))
    text_input = nfkd_form.encode('ASCII', 'ignore')

    sent_tokens = sent_tokenize(text_input)

    tokens = map(word_tokenize, sent_tokens)

    sent_tags = map(pos_tag, tokens)

    grammar = r"""
        SENT: {<(J|N).*>}                # chunk adjective and noun tokens
    """

    cp = RegexpParser(grammar)
    ret_tokens = list()
    stemmer_snowball = SnowballStemmer('english')

    for sent in sent_tags:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'SENT':
                t_tokenlist = [tpos[0].lower() for tpos in subtree.leaves()]
                t_tokens_stemsnowball = map(stemmer_snowball.stem, t_tokenlist)
                #t_token = "-".join(t_tokens_stemsnowball)
                #ret_tokens.append(t_token)
                ret_tokens.extend(t_tokens_stemsnowball)
            #if subtree.label() == 'V2V': print(subtree)
    #tokens_lower = [map(string.lower, sent) for sent in tokens]

    stop_words = {'book', 'author', 'read', "'", 'character',
                  ''}.union(ENGLISH_STOP_WORDS)

    tokens = [token for token in ret_tokens if token not in stop_words]

    return (tokens)
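Stripped of the Python 2 and environment-specific plumbing, the core of the function is a single-token grammar pass followed by stemming. A small Python 3 sketch of that core, using a hand-tagged sentence of my own:

from nltk import RegexpParser
from nltk.stem import SnowballStemmer

grammar = r"""
    SENT: {<(J|N).*>}    # single adjective or noun tokens
"""
cp = RegexpParser(grammar)
stemmer = SnowballStemmer('english')

tagged = [("interesting", "JJ"), ("books", "NNS"), ("about", "IN"),
          ("distributed", "JJ"), ("systems", "NNS")]

tokens = []
for subtree in cp.parse(tagged).subtrees():
    if subtree.label() == 'SENT':
        tokens.extend(stemmer.stem(word.lower()) for word, tag in subtree.leaves())
print(tokens)  # ['interest', 'book', 'distribut', 'system']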
def get_chunks(tagged_sent):
    chunkgram = r"""VB-Phrase: {<DT><,>*<VB>}
                    VB-Phrase: {<RB><VB>}
                    VB-Phrase: {<UH><,>*<VB>}
                    VB-Phrase: {<UH><,><VBP>}
                    VB-Phrase: {<PRP><VB>}
                    VB-Phrase: {<NN.?>+<,>*<VB>}
                    Q-Tag: {<,><MD><RB>*<PRP><.>*}"""
    chunkparser = RegexpParser(chunkgram)
    return chunkparser.parse(tagged_sent)
Example 31
def pos_chunks():
    text = "learn php from guru99 and make study easy".split()
    print("After Split: ", text)
    tokens_tag = pos_tag(text)
    print("After Token: ", tokens_tag)
    patterns = """mychunk: {<NN.?>*<VBD.?>*<JJ.?>*<CC>?}"""
    chunker = RegexpParser(patterns)
    print("After Regex: ", chunker)
    output = chunker.parse(tokens_tag)
    print("After Chunking: ", output)
Example 32
def findRelationshipUsingGrammer(phrase):
    phrase = tag(phrase)
    grammer ='REL: {<RB><RBR><IN>|' \
                 '<RB><JJ|JJR|JJS><IN>|' \
                 '<JJ|JJR|JJS><IN>|' \
                 '<JJ|JJR|JJS>|' \
                 '<JJ|JJR|JJS><TO>}'
    parseTree = RegexpParser(grammer).parse(phrase)
    for i in parseTree.subtrees(filter=lambda x: x.label() == 'REL'):
        return ' '.join([ k[0] for k in list(i)])
Example 33
def getChunk(question):
    """
    helper method to get NP
    """
    qPOS = pos_tag(word_tokenize(question))
    t = ne_chunk(qPOS)
    Pattern = "NP:{<DT>?<JJ|PR.>*<NN|NNS>}"
    np_parser = RegexpParser(Pattern)
    T = np_parser.parse(t)
    return T
Example 34
    def _word_combination(self, pos_tagged_sentence):

        # Finding entities still testing
        grammar = r"""
        EN: {<NN.*><CD>+}
        """
        cp = RegexpParser(grammar)
        result = cp.parse(pos_tagged_sentence)

        return result
Example 35
def extract_verbphrase(tagged_sent):
    chunkgram = r"""VB-Phrase: {<UH><,>*<VB>}
                    VB-Phrase: {<UH><,><VBP>}
                    VB-Phrase: {<PRP><VB>}
                    VB-Phrase: {<NN.?>+<,>*<VB>}
                    VB-Phrase: {<DT><,>*<VB>}
                    VB-Phrase: {<RB><VB>}
                    Q-Tag: {<,><MD><RB>*<PRP><.>*}"""
    vbchunkparser = RegexpParser(chunkgram)
    return vbchunkparser.parse(tagged_sent)
 def test_tag_pattern2re_pattern_quantifier(self):
     """Test for bug https://github.com/nltk/nltk/issues/1597
 
     Ensures that curly bracket quantifiers can be used inside a chunk rule.
     This type of quantifier has been used for the supplementary example
     in http://www.nltk.org/book/ch07.html#exploring-text-corpora.
     """    
     sent = [('The', 'AT'), ('September-October', 'NP'), ('term', 'NN'), ('jury', 'NN'), ('had', 'HVD'), ('been', 'BEN'), ('charged', 'VBN'), ('by', 'IN'), ('Fulton', 'NP-TL'), ('Superior', 'JJ-TL'), ('Court', 'NN-TL'), ('Judge', 'NN-TL'), ('Durwood', 'NP'), ('Pye', 'NP'), ('to', 'TO'), ('investigate', 'VB'), ('reports', 'NNS'), ('of', 'IN'), ('possible', 'JJ'), ('``', '``'), ('irregularities', 'NNS'), ("''", "''"), ('in', 'IN'), ('the', 'AT'), ('hard-fought', 'JJ'), ('primary', 'NN'), ('which', 'WDT'), ('was', 'BEDZ'), ('won', 'VBN'), ('by', 'IN'), ('Mayor-nominate', 'NN-TL'), ('Ivan', 'NP'), ('Allen', 'NP'), ('Jr.', 'NP'), ('.', '.')] # source: brown corpus
     cp = RegexpParser('CHUNK: {<N.*>{4,}}')
     tree = cp.parse(sent)
     assert tree.pformat() == """(S
def filter_sentences_by_chunk(pos_data, tokens):
    chunker = RegexpParser(CHUNKER_GRAMMAR)
    filtered = {}
    for sentence_id, data in pos_data.iteritems():
        result = chunker.parse(data)
        good_one = False
        if 'CHUNK' in [s.label() for s in result.subtrees()]:
            for t in result.subtrees(lambda result: result.label() == 'CHUNK'):
                for token, pos in t.leaves():
                    if pos.find('VER') != -1 and token in tokens: good_one = True
                if good_one:
                    filtered[sentence_id] = ' '.join(item[0] for item in data)
    return filtered
Example 39
    def word_combination(self, pos_tagged_sentence):
        """Chunking of a part of speech tagged sentence based on specific grammar"""
        # grammar = r"""
        # EN:{(<JJ>*<NN.*>+<IN>)?<JJ>*<NN.*>+}
        # """

        # Previous one
        grammar = r"""
        EN: {<JJ.*>*<NN.*>+}
        """

        cp = RegexpParser(grammar)
        result = cp.parse(pos_tagged_sentence)
        return result
Example 40
def recuperarEntidades(texto):
	chunker = RegexpParser("""
	ENTI:
	    {<NNP|NNPS>+<NNP|NNPS|NN|NNS>}  # Nouns and Adjectives, terminated with Nouns
	    {<NN|NNS>+<NN|NNS><JJ>} 
	    {<NNP|NNPS><IN|DT><NNP|NNPS|NN|NNS>}
	    {(<NN|NNS><JJ>)|<JJ><NN|NNS>}
	    {<NNP|NNPS>}
	ENTIDACOMP:
		{<DT><NN|NNS><ENTI>}
		{<DT><NN|NNS><IN><ENTI>}
		{<ENTI>(<IN>|<IN><DT>)<ENTI|NN|NNS>}
		{<ENTI|ENTIDACOMP><JJ><IN><ENTI|ENTIDACOMP>}
	    {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}  # Above, connected with in/of/etc...
	    {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
	ENTIDACOMP2:
		{<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
	FECHA:
		{<LS|CD><IN><ENTI><DT><LS|CD>}
		{<LS|CD><IN><ENTI>}
		{<ENTI><DT><LS|CD>}
		{<ENTI><LS|CD>}


	""")
	Lista = []
	for sentence in sent_tokenize(texto):
		tags = tagear(sentence)
		parsed = chunker.parse(tags)
		for chunk in parsed:
			if hasattr(chunk, 'node'):
				Lista.append(' '.join(c[0] for c in chunk.leaves()))

	return Lista
Example 41
    def grammar_selection(self, grammar=None):
        """ Select candidates using nltk RegexpParser with a grammar defining
            noun phrases (NP).

            Args:
                grammar (str): grammar defining POS patterns of NPs.
        """

        # initialize default grammar if none provided
        if grammar is None:
            grammar = r"""
                NBAR:
                    {<NN.*|JJ>*<NN.*>} 
                    
                NP:
                    {<NBAR>}
                    {<NBAR><IN><NBAR>}
            """

        # initialize chunker
        chunker = RegexpParser(grammar)

        # loop through the sentences
        for i, sentence in enumerate(self.sentences):

            # compute the offset shift for the sentence
            shift = sum([s.length for s in self.sentences[0:i]])

            # convert sentence as list of (offset, pos) tuples
            tuples = [(str(j), sentence.pos[j]) for j in range(sentence.length)]

            # parse sentence
            tree = chunker.parse(tuples)

            # find candidates
            for subtree in tree.subtrees():
                if subtree.label() == 'NP':
                    leaves = subtree.leaves()

                    # get the first and last offset of the current candidate
                    first = int(leaves[0][0])
                    last = int(leaves[-1][0])

                    # add the NP to the candidate container
                    self.add_candidate(words=sentence.words[first:last+1],
                                       stems=sentence.stems[first:last+1],
                                       pos=sentence.pos[first:last+1],
                                       offset=shift+first,
                                       sentence_id=i)
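The trick in this method is that the "words" handed to the chunker are string offsets, so each NP subtree directly yields a token range. A condensed sketch of that offset trick outside the class, using the default grammar above and a toy POS sequence:

from nltk import RegexpParser

grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}

    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}
"""
chunker = RegexpParser(grammar)

pos = ["JJ", "NN", "IN", "NN", "VBZ", "JJ"]            # POS tags of one sentence
tuples = [(str(j), pos[j]) for j in range(len(pos))]   # (offset, tag) pairs

tree = chunker.parse(tuples)
for subtree in tree.subtrees():
    if subtree.label() == "NP":
        leaves = subtree.leaves()
        print(int(leaves[0][0]), int(leaves[-1][0]))   # first and last token offset
# 0 1
# 3 3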
Example 42
 def __init__(self):
    grammar = r"""
       NP: {<DT>?<JJ.*|CD>*<NN.*>+}
       NP: {<NP><of><NP>}            # need to change tags of "of" to <of>!!
       NP: {<NP><in><NP>}            # need to change tags of "of" to <of>!!
    """
    self.parser = RegexpParser(grammar)
def parse(query_text, networks_json):
    query_text = preprocess(query_text)
    tokens = word_tokenize(query_text)
    double_tokens = [ (w, w) for w in tokens ]
    wg = word_grammar()
    w_cp = RegexpParser(compile_grammar(wg))
    word_result = w_cp.parse(double_tokens)
    word_result = convert_dates(word_result)
    new_tokens = list(zip(*(word_result.leaves()))[0])
    tagged = pos_tag(new_tokens)
    domain_tagged = tag_domains(tagged, networks_json)
    tg = tag_grammar()
    t_cp = RegexpParser(compile_grammar(tg))
    tagged_result = t_cp.parse(domain_tagged)
    slots = assign_slots(new_tokens, tagged_result, word_result)
    interpreted_input = make_sentence(slots)
    print 'tagged-result = ',tagged_result
    print 'word-result = ',word_result
    return {"parse":slots, "interpreted":interpreted_input}
Example 44
 def __init__(self, word_tokenize=None, sent_tokenize=None,
              pos_tag=None, stop_words=None, punct=None,
              grammar=chunk_grammar_propernouns):
     self._word_tokenize = word_tokenize if word_tokenize else nltk.word_tokenize
     self._sent_tokenize = sent_tokenize if sent_tokenize else nltk.sent_tokenize
     self._pos_tag = pos_tag if pos_tag else nltk.pos_tag
     self._stop_words = stop_words if stop_words else set(nltk.corpus.stopwords.words('english'))
     self._punct = punct if punct else set(string.punctuation)
     self._chunk_grammar = grammar
     self._chunker = RegexpParser(self._chunk_grammar)
Example 45
def chunkingList(dataS, chunkgram):
    """
    This function will find the chunk
    """
    #data = str(dataS)
    words = word_tokenize(str(dataS)[1:])
    #print words
    ps = pos_tag(words)

#    print ps
    
    chunkParser = RegexpParser(chunkgram)
    chunked = chunkParser.parse(ps)
    #print chunked
    tree = Tree('s', chunked)
    docs = []
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'Chunk'):
        # Assemble the chunk into one line and strip extra punctuations
        docs.append(" ".join([a for (a,b) in subtree.leaves()]))
    return docs
Example 46
    def word_combination(pos_tagged_sentence, tag_set='ptb'):
        """Chunking of a part of speech tagged sentence based on specific grammar"""
        # grammar = r"""
        # EN:{(<JJ>*<NN.*>+<IN>)?<JJ>*<NN.*>+}
        # """
        if tag_set == 'ptb':
            # Entity grammar used for the Penn Tree Bank Tagset
            grammar = r"""
            EN: {<JJ.*>*<NN.*>+}
            """
        elif tag_set == 'universal':
            # Entity grammar used for the Universal Tagset
            grammar = r"""
            EN: {<ADJ>*<NOUN>+}
            """
        else:
            raise SyntaxError

        cp = RegexpParser(grammar)
        result = cp.parse(pos_tagged_sentence)
        return result
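With the universal tagset the entity rule only needs the coarse ADJ/NOUN tags. A sketch with hand-written universal tags (what pos_tag(..., tagset='universal') would return, written out here so no tagger data is required):

from nltk import RegexpParser

cp = RegexpParser(r"""
EN: {<ADJ>*<NOUN>+}
""")

# Universal-tagset tags for "deep neural networks learn useful representations".
tagged = [("deep", "ADJ"), ("neural", "ADJ"), ("networks", "NOUN"),
          ("learn", "VERB"), ("useful", "ADJ"), ("representations", "NOUN")]

for subtree in cp.parse(tagged).subtrees(lambda t: t.label() == "EN"):
    print(" ".join(word for word, tag in subtree.leaves()))
# deep neural networks
# useful representations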
Example 47
def relationships_of(string):
  # relationship data is stored in a parenthetical immediately after the end of the </font> tag in the bio
  # e.g. "(son of Joseph Patrick Kennedy, II, and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)"
  pattern = "^\((.*?)\)"
  match = re.search(pattern, string, re.I)

  relationships = []
  
  if match and len(match.groups()) > 0:
    relationship_text = match.group(1).encode("ascii", "replace")

    # since some relationships refer to multiple people--great-nephew of Edward Moore Kennedy AND John Fitzgerald Kennedy--we need a special grammar
    from nltk import tree, pos_tag, RegexpParser
    tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text)
    pos = pos_tag(tokens)

    grammar = r"""
      NAME: {<NNP>+}
      NAMES: { <IN><NAME>(?:<CC><NAME>)* }
      RELATIONSHIP: { <JJ|NN|RB|VB|VBD|VBN|IN|PRP\$>+ }
      MATCH: { <RELATIONSHIP><NAMES> }
      """
    cp = RegexpParser(grammar)   
    chunks = cp.parse(pos)

    # iterate through the Relationship/Names pairs
    for n in chunks:
      if isinstance(n, tree.Tree) and n.node == "MATCH":
        people = []
        relationship = None
        for piece in n:
          if piece.node == "RELATIONSHIP":
            relationship = " ".join([x[0] for x in piece])
          elif piece.node == "NAMES":
            for name in [x for x in piece if isinstance(x, tree.Tree)]:
              people.append(" ".join([x[0] for x in name]))
        for person in people:
          relationships.append({ "relation": relationship, "name": person})
  return relationships
Example 49
 def parse_sent(self, pos_tagged_sentence, grammar=None):
     # NOTE: the grammar argument is ignored; every grammar in GRAMMARS is applied in turn.
     parsed_tagged_sents = []
     for grammar in GRAMMARS:
         parsedsent = []
         cp = RegexpParser(grammar)
         for sentence in pos_tagged_sentence:
             result = cp.parse(sentence)
             for node in result:
                 if str(type(node)) == "<class 'nltk.tree.Tree'>":
                     temp = ' '.join(word for word, POS in node.leaves())
                     if len(temp.split()) >= 2:
                         parsedsent.append(temp)
                         parsed_tagged_sents.append(node.leaves())
     return parsed_tagged_sents
 def __init__(self):
     """
     """
     self.filter = DefaultFilter()
     self.tokenizer = getUtility(ITokenizer,
         name="collective.classification.tokenizers.NLTKTokenizer")
     self.tagger = getUtility(IPOSTagger,
             name="collective.classification.taggers.PennTreebankTagger")
     self.tagger_metadata = {'type':'Pen TreeBank','categories':[]}
     self.np_grammar = r"""
         NP: {<JJ>*<NN>}         # chunk adjectives and nouns
             {<NNP>+}            # chunk proper nouns
             """
     self.np_finder = RegexpParser(self.np_grammar)
Example 51
def compare(sentence, grammar):
    """
    Compare sentence against a grammar rule to see if any matches
    are found

    Parameters
    ----------
    sentence: list of (str, str) tuples
        a single POS-tagged sentence for which matches are to be found

    grammar: str
        grammar rule in regexp format

    Returns
    -------
    matches: list of nltk.tree.Tree
        all matches with the grammar rule

    """
    matches = []

    # Apply grammar rule
    cp = RegexpParser(grammar)
    chunk = cp.parse(sentence)
    # Identify label of the rule
    label = grammar.split(':')[0]

    for n in chunk:
        if isinstance(n, nltk.tree.Tree):
            if n.label() == label:
                matches.append(n)

    if matches == []:
        matches.append('None')

    return matches
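A usage sketch of the same matching logic on a pre-tagged sentence; the grammar rule here is my own example, and note (as per the docstring) that the sentence must already be POS tagged before it reaches the parser:

import nltk
from nltk import RegexpParser

grammar = "VBN-PHRASE: {<VBN><IN>}"
sentence = [("The", "DT"), ("house", "NN"), ("built", "VBN"),
            ("in", "IN"), ("1920", "CD"), ("stands", "VBZ")]

cp = RegexpParser(grammar)
chunk = cp.parse(sentence)
label = grammar.split(':')[0]

matches = [n for n in chunk if isinstance(n, nltk.tree.Tree) and n.label() == label]
print(matches)  # [Tree('VBN-PHRASE', [('built', 'VBN'), ('in', 'IN')])]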
Example 52
class Chunker:
   def __init__(self):
      grammar = r"""
         NP: {<DT>?<JJ.*|CD>*<NN.*>+}
         NP: {<NP><of><NP>}            # need to change tags of "of" to <of>!!
         NP: {<NP><in><NP>}            # need to change tags of "of" to <of>!!
      """
      self.parser = RegexpParser(grammar)

   def parse(self, sent):
      """
      sent should be a list of tuples of word and tag
      """
      for i, (word, pos) in enumerate(sent):
         if word == 'of' or word == 'in':
            sent[i] = (word, word)
      return self.parser.parse(sent)

   def print_chunks(self, tree, label):
      for node in tree:
         if type(node) == Tree and node.node == label:
            print node.leaves()

   def get_chunks(self, tree, label):
      """
      return a list of ranges (tuples) marking the start and end index of the chunk
      """
      offset = 0
      chunks = []
      for node in tree:
         if type(node) == Tree and node.node == label:
            phrase_size = len(node.leaves())
            chunks.append((offset, offset + phrase_size - 1))
            offset += phrase_size
         else:
            offset += 1
      return chunks
class GrammarExtractor(SentenceExtractor):
    """ Grammar-based extraction strategy: pick sentences that comply with a pre-defined grammar. """

    splitter = None
    parser = None
    # Grammars rely on POS labels, which are language-dependent
    grammars = {
        'en': r"""
                NOPH: {<PDT>?<DT|PP.*|>?<CD>?<JJ.*|VVN>*<N.+|FW>+<CC>?}
                CHUNK: {<NOPH>+<MD>?<V.+>+<IN|TO>?<NOPH>+}
               """,
        'it': r"""
                SN: {<PRO.*|DET.*|>?<ADJ>*<NUM>?<NOM|NPR>+<NUM>?<ADJ|VER:pper>*}
                CHUNK: {<SN><VER.*>+<SN>}
               """,
    }

    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)
        grammar = self.grammars.get(self.language)
        if grammar:
            self.parser = RegexpParser(grammar)
        else:
            raise ValueError(
                "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                    self.language, self.grammars.keys())
            )

        for lemma, match_tokens in self.lemma_to_token.iteritems():
            self.lemma_to_token[lemma] = set([match.lower() for match in match_tokens])

    def extract_from_item(self, item):
        extracted = []
        url = item.get('url')
        if not url:
            logger.warn('skipping item without url')
            return

        document = item.get(self.document_key)
        if not document:
            return
        elif isinstance(document, list):
            document = '\n'.join(document)

        # Sentence splitting
        sentences = self.splitter.split(document)
        tokens = 0
        for sentence in sentences:
            tagged = [(token, pos) for token, pos, lemma in self.tagger.tag_one(sentence)]

            # Parsing via grammar
            parsed = self.parser.parse(tagged)

            # Loop over sub-sentences that match the grammar
            for grammar_match in parsed.subtrees(lambda t: t.label() == 'CHUNK'):
                logger.debug("Grammar match: '%s'" % grammar_match)
                # Look up the LU
                for token, pos in grammar_match.leaves():
                    # Restrict match to sub-sentence verbs only
                    if pos.startswith('V'):
                        for lemma, match_tokens in self.lemma_to_token.iteritems():
                            if token.lower() in match_tokens:
                                # Return joined chunks only
                                # TODO test with full sentence as well
                                # TODO re-constitute original text (now join on space)
                                text = ' '.join([leaf[0] for leaf in grammar_match.leaves()])
                                logger.debug("Extracted sentence: '%s'" % text)
                                logger.debug("Sentence token '%s' is in matches %s" % (token, match_tokens))
                                logger.debug("Extracted sentence: %s" % text)
                                extracted.append({
                                    'lu': lemma,
                                    'text': text,
                                    'tagged': tagged,
                                    'url': url,
                                })

        if extracted:
            logger.debug("%d sentences extracted. Removing the full text from the item ...", len(extracted))
            item.pop(self.document_key)
            return item, extracted
        else:
            logger.debug("No sentences extracted. Skipping the whole item ...")
Example 54
    del sents

    t3 = datetime.now()

    # pos_sents=
    # [[('In', 'IN'),
    #   ('the', 'DT'),
    #   ('land', 'NN'),
    #   ('of', 'IN'),
    #   ('submarines', 'NNS'),
    #   ('.', '.')],
    #    .... ]

    matcheur = RegexpParser(
    """
    truc:
            {<JJ.*>*<NN.*>+(<P|IN> <JJ.*>*<NN.*>+)*}
    """
    )

    # read/analyse the expressions matching our search
    recog_trees = []
    for s in pos_sents:
        reconnu = matcheur.parse(s)
        recog_trees.append(reconnu)
    del pos_sents

    t4 = datetime.now()
    # [('We', 'PRP'), ('all', 'DT'), ('live', 'VBP'), ('in', 'IN'), ('a', 'DT'), ('yellow', 'JJ'), ('submarine', 'NN'), ('.', '.')]
    # (S
    #   We/PRP
    #   all/DT
Example 55
class Extraction:

    """This class is used to extract nouns, proper nouns, phrases from text"""

    def __init__(self, word_tokenize=None, sent_tokenize=None,
                 pos_tag=None, stop_words=None, punct=None,
                 grammar=chunk_grammar_propernouns):
        self._word_tokenize = word_tokenize if word_tokenize else nltk.word_tokenize
        self._sent_tokenize = sent_tokenize if sent_tokenize else nltk.sent_tokenize
        self._pos_tag = pos_tag if pos_tag else nltk.pos_tag
        self._stop_words = stop_words if stop_words else set(nltk.corpus.stopwords.words('english'))
        self._punct = punct if punct else set(string.punctuation)
        self._chunk_grammar = grammar
        self._chunker = RegexpParser(self._chunk_grammar)

    def extract_chunks_sent(self, sent):
        """
        Extract chunk phrases from a sentence.
        :param sent: a sentence level text.
        :return: chunk phrases
        """
        tags = self._pos_tag(self._word_tokenize(sent))
        chunks = nltk.chunk.tree2conlltags(self._chunker.parse(tags))
        # join constituent chunk words into a single chunked phrase
        return [' '.join(word for word, pos, chunk in group)
                  for key, group in itertools.groupby(chunks, lambda (word, pos, chunk): chunk != 'O') if key]

    def extract_chunks_doc(self, text):
        """
        Extract chunk phrases from a document.
        :param text: a document level text
        :return: chunk phrases
        """
        sents = self._sent_tokenize(text)
        sents = [s for s in sents if s]
        return list(itertools.chain.from_iterable(map(self.extract_chunks_sent, sents)))

    def extract_words_sent(self, sent, good_tags=set(['NN', 'NNS'])):
        """
        Extract desired words from a sentence.
        :param sent: a sentence level text
        :param good_tags: desired word tags
        :return: words with desired word tags
        """
        tagged_words = self._pos_tag(self._word_tokenize(sent))
        words = [word for word, tag in tagged_words
                if tag in good_tags and word.lower() not in self._stop_words
                and not all(char in self._punct for char in word)]
        return list(set(words))

    def extract_words_doc(self, text, good_tags=set(['NN', 'NNS'])):
        """
        Extract desired words from a document
        :param text: a document level text
        :param good_tags: desired word tags
        :return: words with desired word tags
        """
        sents = self._sent_tokenize(text)
        sents = [s for s in sents if s]
        func_extract = lambda x: self.extract_words_sent(x, good_tags)
        words = list(itertools.chain.from_iterable(map(func_extract, sents)))
        return list(set(words))
Example 56
 def __init__(self):
     from nltk import RegexpTagger
     from nltk import RegexpParser
     self.tagger = RegexpTagger(patterns)
     self.chunker = RegexpParser(grammar, trace=COPYRIGHT_TRACE)
Example 57
def InfoExtractor(text):

	### Regex Expressions ###
	#########################
	regex_email = re.compile(r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[^\s]*)',re.IGNORECASE | re.UNICODE)
	regex_phone = re.compile(r'(\d+[\-\+\(]?\d+[\)\-\s]?\d+[\-\s]?\d+)', re.UNICODE)
	
	regex_DOB = re.compile(r'([0-3]?[0-9](?:\.|\/|\-|\s)[0-3]?(?:[0-9]|' + 
		r'(?:Feb|Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January' +
		r'|February|March|April|May|June|July|August|September|October|' +
		r'November|December))(?:\.|\/|\-|\s)(?:[0-9]{2})?[0-9]{2})',re.IGNORECASE | re.UNICODE)

	#regex_phone = re.compile(r'\s*(?:\+?(\d{1,3}))?([-. (]*(\d{3})[-. )]*)?((\d{3})[-. ]*(\d{2,4})(?:[-.x ]*(\d+))?)$',re.IGNORECASE | re.UNICODE)
	
	info = dict()

	regex = {
	'email':regex_email,
	'phone':regex_phone,
	'DOB':regex_DOB
	}

	for exp in regex.keys():
		info[exp] = regex[exp].findall(text)

	#Filtering phone numbers
	info['phone'] = [x for x in info['phone'] if len(x)>5]

	print text

	### Sent Tokenize ###
	######################
	
	sent = sent_tokenize(text.decode("utf8"))
	print sent
	print
	### Word Tokenize ###  
	#####################

	sent = [ word_tokenize(word) for word in sent ]
	#print words

	sent = [pos_tag(word) for word in sent]
	#print sent[0]
	#print sent
	#print sent[1]

	grammar = "NP: {<DT>?<JJ>*<NN>}"

	cp = RegexpParser(grammar)
	result = cp.parse(sent[0])
	#print result
	#result.draw()

	#print sent
	'''
	raw_tuples = sent[0].split('\n')

	for line in raw_tuples:
		try:
			key, value = line.split('\t')
			print key, value
			print
		except:
			pass 

	'''

	#return None
	return info
Example 58
	return token

def sentimentanalysis(texto):
	testimonial = TextBlob(texto)
	for zen in testimonial.words:
		print zen.translate(to="en")


chunker = RegexpParser("""
ENTI:
    {<NNP|NNPS>+<NNP|NNPS|NN|NNS>}  # Nouns and Adjectives, terminated with Nouns
    {<NNP|NNPS><IN><NNP|NNPS>}
    {<NNP|NNPS>}
ENTIDACOMP:
	{<DT><NN|NNS><ENTI>}
	{<DT><NN|NNS><IN><ENTI>}
	{<ENTI><IN><ENTI>}	
    {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}  # Above, connected with in/of/etc...
    {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
ENTIDACOMP2:
	{<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
    

""")
"""
NBAR:
    {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
NP:

    {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc... """
#sentimentanalysis("su")
print 'asadasdasdasdsadadsa'
class NPExtractor(Persistent):
    """
    """
    
    implements(ITermExtractor)
    
    def __init__(self):
        """
        """
        self.filter = DefaultFilter()
        self.tokenizer = getUtility(ITokenizer,
            name="collective.classification.tokenizers.NLTKTokenizer")
        self.tagger = getUtility(IPOSTagger,
                name="collective.classification.taggers.PennTreebankTagger")
        self.tagger_metadata = {'type':'Pen TreeBank','categories':[]}
        self.np_grammar = r"""
            NP: {<JJ>*<NN>}         # chunk adjectives and nouns
                {<NNP>+}            # chunk proper nouns
                """
        self.np_finder = RegexpParser(self.np_grammar)
    
    def _add(self,norm, terms):
        terms.setdefault(norm, 0)
        terms[norm] += 1
    
    @ram.cache(_extractor_cachekey)
    def extract(self,text):
        """
        """
        tokens = self.tokenizer.tokenize(text)
        tagged_terms = self.tagger.tag(tokens)
        terms = {}
        np_terms = {}
        
        noun_phrases = [
            node
            for node in self.np_finder.parse(tagged_terms)
            if not isinstance(node,tuple)]
        
        for node in noun_phrases:
            coll_tag = tree2conlltags(node)
            if len(coll_tag) > 1:
                mterm = [
                    term.lower()
                    for (term,tag,temp) in coll_tag
                    if len(term)>1
                    ]
                
                mterm = ' '.join(mterm)
                self._add(mterm,np_terms)
            for (term,tag,temp) in coll_tag:
                if tag.startswith('N') and len(term)>1:
                    if tag in ['NNS','NNPS']:
                        term = singularize(term)
                    self._add(term.lower(),terms)
        
        for term in terms.keys():
            if not self.filter(term,terms[term]):
                del terms[term]
        
        for term in np_terms.keys():
            if not self.filter(term,np_terms[term]):
                del np_terms[term]
        
        return (terms,np_terms)
    
    def setTagger(self,tagger,tagger_metadata={}):
        self.tagger = tagger
        if not tagger_metadata:
            self.tagger_metadata['type']='unknown'
        else:
            self.tagger_metadata = tagger_metadata
Example 60
 def parse_features(self,review):
     cp = RegexpParser(self.grammar)
     return cp.parse(review)