Example #1
def parse_semantic_tag(tokenstring):
    """ Extension that appends the SEMANTIC tag to the output of the parser.
    """
    client = MBSP.Mbt(port=6066)
    # Find the semantic tag of words in the sentence.
    # Example: "macrophage/NN/I-NP/O/O/O/macrophage".
    s1 = tokenstring.split()
    # => [[[u'macrophage', u'NN', u'I-NP', u'O', u'O', u'O', u'macrophage']]]
    s2 = s1.reduce([MBSP.WORD])
    # => [[[u'macrophage']]]
    s2 = MBSP.TokenString(client.send(s2.join()), tags=[MBSP.WORD, SEMANTIC])
    # => macrophage/protein
    s2 = s2.split()
    # => [[[u'macrophage', u'protein']]]
    s1.tags.append(SEMANTIC, values=s2.tags.pop(s2.tags.index(SEMANTIC)))
    # => [[[u'macrophage', u'NN', u'I-NP', u'O', u'O', u'O', u'macrophage', u'protein']]]
    s1 = s1.join()
    # => macrophage/NN/I-NP/O/O/O/macrophage/protein
    client.disconnect()
    return s1
Example #2
def update_pos_tag(tokenstring):
    """ Event handler that fires when the MBSP parser is done tagging and chunking.
        Updates the part-of-speech tags from a specialized biomedical corpus.
        Returns the updated string to the parser.
    """
    client = MBSP.Mbt(port=6065)
    # Retag the part-of-speech tags with the GENIA corpus.
    # Example: "TGF-beta1-transcribing/NN/I-NP macrophages/NNS/I-NP"
    s1 = tokenstring.split() 
    # => [[[u'TGF-beta1-transcribing', u'NN', u'I-NP'], [u'macrophages', u'NNS', u'I-NP']]]
    s2 = s1.reduce([MBSP.WORD]) 
    # => [[[u'TGF-beta1-transcribing'], [u'macrophages']]]
    s2 = MBSP.TokenString(client.send(s2.join()), tags=[MBSP.WORD, MBSP.PART_OF_SPEECH])
    # => TGF-beta1-transcribing/JJ macrophages/NNS
    s2 = s2.split() 
    # => [[[u'TGF-beta1-transcribing', u'JJ'], [u'macrophages', u'NNS']]]
    s2.tags.append(MBSP.CHUNK, values=s1.tags.pop(s1.tags.index(MBSP.CHUNK)))
    # => [[[u'TGF-beta1-transcribing', u'JJ', u'I-NP'], [u'macrophages', u'NNS', u'I-NP']]]
    s2 = s2.join()
    # => TGF-beta1-transcribing/JJ/I-NP macrophages/NNS/I-NP
    client.disconnect()
    return s2
Example #3
    def tag(self, sentence_list):
        multiword_expressions = []

        for sentence in sentence_list:
            sentence_parsed = MBSP.parse(sentence,
                                         chunks=False,
                                         relations=False,
                                         anchors=False)
            sentence_lemmatized = " ".join(
                [x.split("/")[2] for x in sentence_parsed.split(" ")])
            multiword_expressions += self.__pattern_1(sentence_lemmatized)
            multiword_expressions += self.__pattern_2(sentence_parsed)

        return multiword_expressions
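
# A standalone sketch (not part of the class above) of the lemma extraction used in tag():
# with chunks, relations and anchors disabled, MBSP.parse() emits word/POS/lemma per token,
# so the lemma is the field at index 2. The sample sentence is made up for illustration.
import MBSP

parsed = MBSP.parse("The mice were injected with macrophages.",
                    chunks=False,
                    relations=False,
                    anchors=False)
print " ".join(token.split("/")[2] for token in parsed.split(" "))
# => roughly: "the mouse be inject with macrophage ."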
Example #4
import MBSP
from pattern.en import parsetree  # parsetree() below is Pattern's, not MBSP's (assumed import)

text = 'Automation is good for the economy. A world dominated by robots is a thing I am looking forward to!'
text = raw_input('Enter text: ')

parse_tree = parsetree(text)

for sentence in parse_tree:
    for chunk in sentence.chunks:
        for word in chunk.words:
            print str(word),
        print '\n', str(chunk)
        print

self_parse = []
construct = []
sentence = []
parsed_text = MBSP.parse(text)
parsed_sentences = parsed_text.split('O/.')
parsed_words = parsed_text.split(' ')
for word in parsed_words:
    element = [e.encode('ascii') for e in word.split('/')]
    construct.append(element)
    if element[1] == '.':
        sentence.append(construct)
        construct = []
    print element
print

# print str(parsed_text)
print [word[0] for word in sentence[0]]
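
# An alternative to the manual '/'-splitting above (a sketch): the TokenString returned
# by MBSP.parse() has a split() method that yields nested [sentence][token][tags] lists,
# as also used in Examples #1, #2 and #18.
for tokens in MBSP.parse(text).split():   # one nested list per sentence
    print [token[0] for token in tokens]  # token[0] is the word form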
Example #5
#### MEMORY-BASED SHALLOW PARSER ######################################################################

# Copyright (c) 2003-2010 University of Antwerp, Belgium and Tilburg University, The Netherlands
# License: GNU General Public License, see LICENSE.txt

######################################################################################################

# Add the upper directory (where the MBSP module is) to the search path.
import os, sys; sys.path.insert(0, os.path.join("..", ".."))
import MBSP

if not MBSP.config.autostart:
    MBSP.start()

s = MBSP.parse("I eat pizza with a fork.")
s = MBSP.split(s) # Yields a list of traversable Sentence objects.
      
for sentence in s:
    for chunk in sentence.chunks:
        print repr(chunk)
        print
        print "      Words:", chunk.words       # A list of Word objects.
        print "  Relations:", chunk.related     # A list of Chunk objects.
        print " Parent PNP:", repr(chunk.pnp)   # A PNPChunk object, or None.
        print "Related PNP:", chunk.attachments # A list of PNPChunk objects.
        print
        
# Remove the servers from memory when you're done:
# MBSP.stop()
Example #6
import MBSP

inpt = ""

with open('_raw/lemmaCase-v6.txt', 'r') as f:
    #	inS = f.read()
    #	parsed_string = MBSP.parse(inS)
    #	text = MBSP.Text(parsed_string, token=[WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA])
    #	print text.sentences

    for line in f:
        inpt = line
        print inpt
        ustr = unicode(inpt, encoding="utf-8")
        tokenized_lemmas = MBSP.lemmatize(ustr, tokenize=True)
        print tokenized_lemmas
        #for lem in tokenized_lemmas:
        #print lem
        #sentence = MBSP.tokenize(ustr);
        #print tokenized_lemmas
        #pos_word = MBSP.tag(word, tokenize=False, lemmata=False)
        #print MBSP.lemma(word.encode('ascii', 'ignore'))
        #print '{} {}'.format(word[0].encode('ascii', 'ignore'), word[1])
Example #7
#### MEMORY-BASED SHALLOW PARSER ######################################################################

# Copyright (c) 2003-2010 University of Antwerp, Belgium and Tilburg University, The Netherlands
# License: GNU General Public License, see LICENSE.txt

######################################################################################################

# Add the upper directory (where the MBSP module is) to the search path.
import os, sys

sys.path.insert(0, os.path.join("..", ".."))
import MBSP

if not MBSP.config.autostart:
    MBSP.start()

s = MBSP.parse("I eat pizza with a fork.")
s = MBSP.split(s)  # Yields a list of traversable Sentence objects.

for sentence in s:
    for chunk in sentence.chunks:
        print repr(chunk)
        print
        print "      Words:", chunk.words  # A list of Word objects.
        print "  Relations:", chunk.related  # A list of Chunk objects.
        print " Parent PNP:", repr(chunk.pnp)  # A PNPChunk object, or None.
        print "Related PNP:", chunk.attachments  # A list of PNPChunk objects.
        print

# Remove the servers from memory when you're done:
# MBSP.stop()
Example #8
def parse(*args, **kwargs):
    s = MBSP.parse(*args, **kwargs)
    s = parse_semantic_tag(s)
    return s
Example #9
def queryGenerator(raw_input_string, change_sentiment):
    #
    #Step 0: Separate whether the user is asking for a meaning or for a debate response
    #
    word, isMeaning = MeaningExtractor.getIfMeaning(str(raw_input_string))
    if isMeaning:
        meaning = MeaningExtractor.getMeaning(word)
        return meaning, True

    #
    #Step 1: Obtain input from the user
    #
    s = str(raw_input_string)

    #
    #Step 2: Convert the sentence into blob and MBSP Sentence objects respectively
    #
    input_string = TextBlob(s)
    clipsSentence = MBSP.Sentence(MBSP.parse(s), token=[MBSP.WORD,MBSP.POS,MBSP.CHUNK,MBSP.PNP,MBSP.REL,MBSP.ANCHOR,MBSP.LEMMA])

    #
    #Step 3: Define the variable required for the analysis and interpretation of input
    #
    query = ""
    w = []
    subjPhrases, verbPhrases, predPhrases = [], [], []
    pNouns, verbs = [], []
    pnps, anchors = [], []

    #
    #Step 4: Obtain all the proper nouns from the sentence
    #
    for sentence in input_string.sentences:
        tagged = sentence.tags
        for word_tag in tagged:
            if word_tag[1]=='NNP' or word_tag[1]=='NNPS':
                pNouns.append(word_tag[0])
        
    #
    #Step 5: Obtain the different parts, i.e. subject, predicate, object of the sentence
    #
    for chunk in clipsSentence.chunks:
        # Compare the chunk's string (not the Chunk object) so duplicates are filtered.
        if chunk.role == 'SBJ' and chunk.string not in subjPhrases:
            subjPhrases.append(chunk.string)
        elif (chunk.type == 'VP' or chunk.type == 'ADVP') and chunk.string not in verbPhrases:
            verbPhrases.append(chunk.string)
        elif (chunk.role == 'PRD' or chunk.role == 'OBJ') and chunk.string not in predPhrases:
            predPhrases.append(chunk.string)

    #
    #Step 6: Detect the prepositional noun phrases and the anchors corresponding to them (Ref: CLIPS docs)
    #
    pnps = clipsSentence.pnp
    for item in pnps:
        if item.anchor not in anchors:
            anchors.append(item.anchor)
    #
    #Step 7.0: Train the classifier for sentiment data
    #
    ###with open('sentiment_training_formatted.csv', 'r') as fp:
    ###    classifier = NaiveBayesClassifier(fp, format='csv')

    #
    #Step 7.1: Classify the user input and record the sentiment
    #
    '''def getSentiment(sentence):
        sentiObj= TextBlob(s, analyzer=NaiveBayesAnalyzer()).sentiment
        pos_ratio = sentiObj.p_pos
        neg_ratio = sentiObj.p_neg
        if pos_ratio>=neg_ratio:
            return 'pos'
        else:
            return 'neg'

    input_sentiment = getSentiment(input_string)'''
   
    #
    #Step 8: Generate the final query
    #
    for pNoun in pNouns:
        # Add proper nouns that are not already covered by a subject phrase.
        if not any(pNoun in sbj for sbj in subjPhrases):
            query = query+pNoun+" "
            #print 'pNoun:'+pNoun

    for sbj in subjPhrases:
        query = query+sbj+" "
        #print 'sbj:'+sbj

    for prd in predPhrases:
        query = query+prd+" "
        #print 'prd:'+prd

    for vr in verbPhrases:
        query = query+vr+" "
        #print 'vr:'+vr

    for anc in anchors:
        anc = anc.string
        query = query+anc+" "
        #print 'anc:'+anc

    for pnp in pnps:
        pnp = pnp.string
        query = query+pnp+" "
        #print 'pnp:'+pnp

    #
    #Step 9: [Blank]
    #
   
    #
    #Step 10: Remove repetitive words from the sentence
    #
    query_blob = TextBlob(query)
    wrds = query_blob.words
    final_words = []
    seen = set()
    for wrd in wrds:
        #print wrd
        # Deduplicate case-insensitively while keeping the original casing.
        if str(wrd).lower() not in seen:
            seen.add(str(wrd).lower())
            final_words.append(wrd)

    final_query = ""
    for wrd in final_words:
        final_query = final_query+wrd+" "

    #
    #Step 11: Sort the query words in order of the input
    #
    index_dict = {}
    indexes = []
    for word in TextBlob(final_query).words:
        try:
            index_dict[input_string.index(str(word))] = str(word)
            indexes.append(input_string.index(str(word)))
        except ValueError:
            print "Word not in main string:", word

    indexes.sort()
    final_query = ""
    for index in indexes:
        final_query = final_query+index_dict[index]+" "

    #
    #Step 12.0: Filter query for articles: a,an,the,is
    #
    reps = {' a ':' ', ' an ':' ', ' the ':' ', ' is ':' '}
    final_query = replace_all(final_query, reps)

    if change_sentiment:
        #
        #Step 12.1: Build a dictionary of the replaceable words
        #
        #Note:make priority lists for different set of words
        def create_replace_dict(lines):
            replace_dict = {}
            for line in lines:
                kv = line.split(',')
                replace_dict[kv[0]] = kv[1][:-1]
            return replace_dict
        #
        #Step 12.2: Replace words from the given phrase
        #
        def replace_words(phrase):
            l = open('replace_list.csv', 'r').readlines()
            new_phrase = replace_all(phrase, create_replace_dict(l))
            if new_phrase==phrase:
                l=open('replace_list2.csv', 'r').readlines()
                new_phrase = replace_all(phrase, create_replace_dict(l))
            return new_phrase
        final_query = replace_words(final_query)

        #
        #Step 13: Get the sentiment of the final query
        #
        '''final_query_sentiment = getSentiment(final_query)'''

        #
        #Step 14: Print out the query
        #
        print "<------------------------------------------->"
        print "in:", s
        print "out:", final_query
        print "<------------------------------------------->"

        return final_query, False
        
    else:
        print "<------------------------------------------->"
        print "in:", s
        print "out:", s
        print "<------------------------------------------->"

        return s, False
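
# A hypothetical call of queryGenerator() (assuming MeaningExtractor, TextBlob, replace_all
# and a running MBSP are available; none of them are defined in this fragment):
if __name__ == '__main__':
    # With change_sentiment=False the input is echoed back as the query.
    response, is_meaning = queryGenerator("Automation is good for the economy.",
                                          change_sentiment=False)
    print response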
Example #10
 def _tokenize_MBSP(self, txt):
     tokenized = MBSP.tokenize(txt)
     return unicode(tokenized)
Example #11
 def _parse_MBSP(self, txt):
     parsed = MBSP.parse(txt)
     return unicode(parsed)
Example #12
 def _chunk_MBSP(self, txt):
     chunked = MBSP.chunk(txt)
     return unicode(chunked)
Example #13
#### MEMORY-BASED SHALLOW PARSER ######################################################################

# Copyright (c) 2003-2010 University of Antwerp, Belgium and Tilburg University, The Netherlands
# License: GNU General Public License, see LICENSE.txt

######################################################################################################

# Add the upper directory (where the MBSP module is) to the search path.
import os, sys; sys.path.insert(0, os.path.join("..", ".."))
import MBSP

if not MBSP.config.autostart:
    MBSP.start()

q = 'I eat pizza with a fork.'
s = MBSP.parse(q,
     tokenize = True, # Split tokens, e.g. 'fork.' => 'fork' + '.'
         tags = True, # Assign part-of-speech tags => 'fork' = noun = NN.
       chunks = True, # Assign chunk tags => 'a' + 'fork' = noun phrase = NP.
    relations = True, # Find chunk relations: 'I' = sentence subject = NP-SBJ-1.
      anchors = True, # Find prepositional noun phrase anchors.
      lemmata = True) # Find word lemmata.

# Print the output of the parser in a readable table format.
# The tags assigned to each part-of-speech are listed at:
# http://www.clips.ua.ac.be/pages/mbsp-tags
MBSP.pprint(s)

# Print the output of the parser as XML:
print
print MBSP.xml(s)
Example #14
          text = str(text).replace('\x00 ','').replace('\xef\xbf\xbd','')
          text = str(text).replace('\xf7','').replace('\xc3\xba','').replace('\xb6','').replace('\xa9','').replace('\xe2\x99\xaa','')
          text = str(text).replace('\xc3\xaf','').replace('\x5c','').replace('\xf1','').replace('\xe1','').replace('\xe7','').replace('\xfa','')
          text = str(text).replace('\xf3','').replace('\xed','').replace('\xe9','').replace('\xe0','').replace('\xae','').replace('\xc2','')
          text = str(text).replace('\xc3','').replace('\xa2','').replace('\xbf','')
          if text.isupper(): text = text.lower()
#         print text
      except IndexError:
          print line
          continue

# G. Remove clearly wrong unicode characters -- BOM, NULL (only utf8 hex works)
      line = str(line).replace('\x00 ','').replace('\xef\xbf\xbd','')
      print line,

# H. Parts of speech with MBSP -- resplit the text if needed
      try:
         pos = MBSP.chunk(text, tokenize=True, lemmata=True)
         for pos in pos.splitlines():
             pos = str(pos).replace(' ','|')
             print "".join([field[0],"|",field[1],"|POS_01","|",pos])
      except (UnicodeDecodeError, UnicodeEncodeError, IndexError, AssertionError):
         # Tag failed UTF-8 lines NA to enable repair
         print "".join([field[0],"|",field[1],"|POS_01","|NA"])
         continue

# I. Close the file
fp.close()

# EOF
Example #15
 def _lemmatize_MBSP(self, txt):
     lemmatized = MBSP.lemmatize(txt)
     return unicode(lemmatized)
Example #16
#### MEMORY-BASED SHALLOW PARSER ######################################################################

# Copyright (c) 2003-2010 University of Antwerp, Belgium and Tilburg University, The Netherlands
# License: GNU General Public License, see LICENSE.txt

######################################################################################################

# Add the upper directory (where the MBSP module is) to the search path.
import os, sys; sys.path.insert(0, os.path.join("..", ".."))
import MBSP

if not MBSP.config.autostart:
    MBSP.start()

s = MBSP.parse("I ate many slices of pizza with a fork.")
s = MBSP.split(s)

# A useful operation is to extract the heads in a sentence,
# for example to create a "normalized" sentence, or to construct a Timbl lookup instance.
# A head is the principal word in a chunk.
# We could retrieve the heads by iterating over Sentence.chunks, 
# but this would skip the loose words in between chunks (e.g. "and" or ","),
# which can also be useful, particularly in the case of constructing a lookup instance.
# Sentence.constituents() returns an in-order list of mixed Chunk and Word objects 
# that can be used for this purpose:
heads = []
for p in s[0].constituents(pnp=False):
    if isinstance(p, MBSP.Chunk):
        # For a chunk, keep the index and lemma of its head word.
        heads.append((
            p.head.index,
            p.head.lemma))
    if isinstance(p, MBSP.Word):
        # Loose words in between chunks are kept as-is.
        heads.append((
            p.index,
            p.lemma))
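
# Follow-up sketch: rebuild a "normalized" sentence from the collected (index, lemma) pairs.
print " ".join(lemma for index, lemma in sorted(heads))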
Example #17
def chomp_wurds():
    wurds = request.json['wurds']

    print "Got wurds: " + wurds
    breakdown = MBSP.parse(wurds)
    return jsonify({'orig': wurds, 'breakdown': breakdown}), 201
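
# The handler above assumes a Flask app with request and jsonify in scope; none of that
# wiring is part of the original fragment. A minimal, hypothetical setup (the URL rule
# is illustrative, not taken from the original):
from flask import Flask, request, jsonify
import MBSP

app = Flask(__name__)
app.add_url_rule('/wurds', 'chomp_wurds', chomp_wurds, methods=['POST'])

if __name__ == '__main__':
    app.run()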
Example #18
import MBSP

inpt = ""

with open('_raw/posCase-v5.txt', 'r') as f:
    inpt = f.read().replace('\n', ' ')

ustr = unicode(inpt, encoding="utf-8")

#tokenized_words = MBSP.tokenizer.split(ustr, tags=True, replace={}, ignore=[])
tokenized_words = MBSP.tag(ustr, tokenize=True, lemmata=False)

#MBSP.pprint(tokenized_words)
tokens_split = tokenized_words.split()
for words in tokens_split:
    for word in words:
        print '{} {}'.format(word[0].encode('ascii', 'ignore'), word[1])
Example #19
import os
import MBSP

# MODULE is assumed to be the directory of this extension, so that the GENIA .settings
# files referenced below can be found in its models/ subfolder.
MODULE = os.path.dirname(os.path.abspath(__file__))

# Ensure that the tokenizer's biomedical mode is enabled:
MBSP.tokenizer.BIOMEDICAL = True

# The biomedical parse() function is similar to MBSP's,
# but the output has an additional SEMANTIC tag at the end ('cell_type', 'NONE', ...)
# This tag ends up in Word.custom_tags when split() is called with the TokenString output.
SEMANTIC = 'semantic'

#--- INSTALL SERVERS ---------------------------------------------------------------------------------

MBSP.active_servers.append(
    MBSP.Server(
            name = 'biomedical_pos',
            port = 6065, 
         process = MBSP.MBT,
        features = {'-s' : os.path.join(MODULE, 'models', 'GENIAPOS.settings'),}))
        # All the server options are bundled in a .settings file.
    
MBSP.active_servers.append(
    MBSP.Server(
            name = 'biomedical_sem',
            port = 6066, 
         process = MBSP.MBT,
        features = {'-s' : os.path.join(MODULE, 'models', 'GENIASEM.settings'),}))
        # All the server options are bundled in a .settings file.

#--- EXTEND TAGGER/CHUNKER ---------------------------------------------------------------------------

def update_pos_tag(tokenstring):