Example #1
    def __init__(self,
                 nbest=10,
                 overparsing=10,
                 only_parse=False,
                 stop_words=None):
        try:
            from bllipparser import RerankingParser
            # WARNING if only_parse=False, BllipParser depends on PyStanfordDependencies: pip install PyStanfordDependencies
        except ImportError:
            raise ImportError(
                'BllipParser not installed, perhaps it is not supported on OS X yet'
            )

        self.parser = RerankingParser.fetch_and_load('GENIA+PubMed',
                                                     verbose=True)
        # WARNING this can take a long while. Install manually: `python -mbllipparser.ModelFetcher -i GENIA+PubMed`
        """create a Reranking Parser from BllipParser"""
        self.parser.set_parser_options(nbest=nbest, overparsing=overparsing)
        """set parser options"""
        self.only_parse = only_parse
        """whether features should be used from the BllipParser"""
        self.stemmer = LancasterStemmer()
        """an instance of LancasterStemmer from NLTK"""
        self.stop_words = stop_words
        if self.stop_words is None:
            self.stop_words = stopwords.words('english')
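As a point of reference, the bllipparser calls wrapped by the constructor above can be exercised directly. A minimal sketch, assuming the GENIA+PubMed model and an arbitrary sample sentence:

from bllipparser import RerankingParser

# The first call downloads the model (which can take a while), then loads it.
rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
rrp.set_parser_options(nbest=10, overparsing=10)

# simple_parse() returns the best Penn Treebank parse as a string.
print(rrp.simple_parse('The protein binds the receptor.'))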
Example #2
    def load_biomodel(self):
        rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
        for did in self.documents:
            for sentence in self.documents[did].sentences:
                sentence_text = [t.text for t in sentence.tokens]
                #echocall = Popen(["echo", sentence_text] , stdout=PIPE, stderr=PIPE)
                #nc_params = ["nc", "localhost", "4449"]
                #echocall.wait()
                #call = check_output(nc_params , shell=True, stdin=echocall.stdout)

                #res = call.communicate()
                #res = netcat("localhost", 4449, sentence_text)
                #print res.strip()
                #print
                res = rrp.parse(sentence_text)
                if len(res) > 0:
                    print res[0].ptb_parse
                    print sentence.parsetree
                    print
                    #print
                    sentence.bio_parse = str(res[0].ptb_parse)
                else:
                    print sentence_text
                    print "no parse"
                    sentence.bio_parse = sentence.parsetree
                    print
Example #3
import xlrd
import StanfordDependencies
from bllipparser import RerankingParser
from nltk.tokenize import sent_tokenize


def parse_reports(data_path, sheet_name, file_path):

    report_data_file = xlrd.open_workbook(data_path)
    sheet = report_data_file.sheet_by_name(sheet_name)

    rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    sd = StanfordDependencies.get_instance(backend='subprocess')

    for i in range(910, 3852):
        finding = sheet.cell(i, 6).value
        with open(file_path, mode='a') as f:
            f.write('finding no.' + str(i))
            f.write('\n')
        sent_tokenize_list = sent_tokenize(finding)
        for j in range(len(sent_tokenize_list)):
            try:
                with open(file_path, mode='a') as f:
                    f.write('sentence no.' + str(j))
                    f.write('\n')
                sentence = sent_tokenize_list[j]
                tree = rrp.simple_parse(sentence)
                dependencies = sd.convert_tree(tree)
                for token in dependencies:
                    with open(file_path, mode='a') as f:
                        f.write(str(token))
                        f.write('\n')
            except:
                print('error!')
                with open(file_path, mode='a') as f:
                    f.write('error!!!')
                    f.write('\n')
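Stripped of the spreadsheet and file handling, the parse/convert pipeline in parse_reports comes down to two calls. A minimal sketch with a made-up sentence:

from bllipparser import RerankingParser
import StanfordDependencies

rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
sd = StanfordDependencies.get_instance(backend='subprocess')

tree = rrp.simple_parse('No acute infiltrate is seen.')  # Penn Treebank string
for token in sd.convert_tree(tree):                       # Stanford dependency tokens
    print(token)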
Example #4
    def __init__(self, rawTextFileName=None, outputXMLFileName=None):
        """Initializes the Preprocessor and returns it. This includes loading any models that will be used in multiple preprocessing methods (e.g. RerankingParser)

        Args:
            rawTextFileName (str): The name of the raw string narrative file
            outputXMLFileName (str): The name of the BLANK file to contain the intermediate output XML

        Returns:
            Preprocessor object


        """
        if rawTextFileName is not None:
            self.filename = rawTextFileName
            self.textList = {}

            #Initialize the XML file (minimizes XML I/O)
            self.xmlname = outputXMLFileName

            self.rrp = RerankingParser.fetch_and_load('GENIA+PubMed')
            self.parseText()

            #print file
        else:
            print "Need a text file!"
Example #5
    def load_biomodel(self):
        rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
        for did in self.documents:
            for sentence in self.documents[did].sentences:
                sentence_text = [t.text for t in sentence.tokens]
                #echocall = Popen(["echo", sentence_text] , stdout=PIPE, stderr=PIPE)
                #nc_params = ["nc", "localhost", "4449"]
                #echocall.wait()
                #call = check_output(nc_params , shell=True, stdin=echocall.stdout)

                #res = call.communicate()
                #res = netcat("localhost", 4449, sentence_text)
                #print res.strip()
                #print
                res = rrp.parse(sentence_text)
                if len(res) > 0:
                    print res[0].ptb_parse
                    print sentence.parsetree
                    print
                    #print
                    sentence.bio_parse = str(res[0].ptb_parse)
                else:
                    print sentence_text
                    print "no parse"
                    sentence.bio_parse = sentence.parsetree
                    print
Example #6
 def __init__(self, nbest=10, overparsing=10, only_parse=False, stop_words=None):
     self.parser = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=False)
     """create a Reranking Parser from BllipParser"""
     self.parser.set_parser_options(nbest=nbest, overparsing=overparsing)
     """set parser options"""
     self.only_parse = only_parse
     """whether features should be used from the BllipParser"""
     self.stemmer = LancasterStemmer()
     """an instance of LancasterStemmer from NLTK"""
     self.stop_words = stop_words
     if self.stop_words is None:
         self.stop_words = stopwords.words('english')
Example #7
class SentenceParser():
    rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=False)

    @staticmethod
    def parse(sentence):
        return Tree.fromstring(SentenceParser.rrp.simple_parse(sentence))

    @staticmethod
    def tree_parse(sentence):
        return (SentenceParser.all_parses(sentence)[0]).ptb_parse

    @staticmethod
    def all_parses(sentence):
        return SentenceParser.rrp.parse(sentence)
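A hypothetical use of the SentenceParser helper above, assuming the WSJ-PTB3 model has already been fetched and nltk's Tree is in scope as the class expects:

sentence = "The parser returns a Penn Treebank tree."

nltk_tree = SentenceParser.parse(sentence)        # nltk Tree built from the best parse
bllip_tree = SentenceParser.tree_parse(sentence)  # bllipparser Tree of the top-ranked parse
n_best = SentenceParser.all_parses(sentence)      # full n-best list with parser/reranker scores

print(nltk_tree)
print(len(n_best))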
Example #8
    def parse_question(cls, question: str):
        """
        Parses given question into NLP tree

        :type question: str
        :rtype: bllipparser.RerankingParser.Tree
        """
        if cls.instance is None:
            logger = logging.getLogger(cls.__name__)
            logger.info('Loading a parsing model for NLP...')

            # https://pypi.org/project/bllipparser/
            cls.instance = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=False)

            logger.info('Model loaded')

        return cls.instance.parse(question)[0].ptb_parse
Example #9
    def __init__(self, tagger, model):
        """
        Performs all necessary preprocessing

        :param tagger: Path to the Stanford NER Tagger
        :param model: Path to the model for the NER Tagger
        """

        # check if model for tokenizer exists
        try:
            nltk.data.find('punkt.zip')
        except:
            nltk.download('punkt')

        # init NER parser
        self.nerParser = StanfordNERTagger(tagger, model)

        # init Charniak parser
        self.rerankingParser = RerankingParser.fetch_and_load(
            'WSJ+Gigaword-v2')
Example #10
from multiprocessing import Pool
from bllipparser import RerankingParser
import string

def multi_phrase_parse(s, rrp):
    # parse the sentence in column 13 of each ReVerb line and write out the tree
    file_in = open('./reverb_out%s' % s)
    file_out = open('./phrase_out%s' % s, 'w')
    for line in file_in:
        sep_line = line.split('\t')
        sentence = sep_line[12]
        file_out.write(rrp.simple_parse(sentence) + '\n')
    file_in.close()
    file_out.close()

def multi_phrase(arg):
    return multi_phrase_parse(*arg)

if __name__ == '__main__':
    p = Pool()
    parameter = []
    parse = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    suffix = list(string.lowercase[0:10])
    for ele in suffix:
        parameter.append((ele, parse))
    p.map(multi_phrase, parameter)
Example #11
 def __enter__(self):
     self.bllip = RerankingParser.fetch_and_load(self.model_name,
                                                 verbose=True)
     return self
Example #12
	def __init__(self):
		super(PatternLearner, self).__init__()
		self.rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
Example #13
from bllipparser import RerankingParser

from kasami import TreeScorer
from kasami.normalizers import bllip

# Loading WSJ-PTB3 treebank into bllip's RerankingParser
bllip_rrp = RerankingParser.fetch_and_load('WSJ-PTB3')
bllip_parse = lambda s: bllip.normalize_tree(bllip_rrp.parse(s)[0].ptb_parse)

tree = bllip_parse("I am a little teapot")
print(tree)
print(tree.format(depth=1))

for production in tree:
    print(str(production))

sentences = [
    "I am a little teapot", "Here is my handle", "Here is my spout",
    "When I get all steamed up I just shout tip me over and pour me out",
    "I am a very special pot", "It is true",
    "Here is an example of what I can do", "I can turn my handle into a spout",
    "Tip me over and pour me out"
]

teapot_grammar = TreeScorer.from_tree_bank(bllip_parse(s) for s in sentences)

teapot_grammar.score(bllip_parse("Here is a little teapot"))
teapot_grammar.score(bllip_parse("It is my handle"))
teapot_grammar.score(bllip_parse("I am a spout"))
teapot_grammar.score(bllip_parse("Your teapot is gay"))
teapot_grammar.score(bllip_parse("Your mom's teapot is asldasnldansldal"))
Example #14
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import pdist, squareform
import os
import sys
reload(sys)
sys.setdefaultencoding('utf-8')


from bllipparser import RerankingParser
import StanfordDependencies

source_path = '/home/admin6019/Downloads/testsentence'



rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
nbest_list = rrp.parse('Why does a zebra have stripes and a giraffe has square spots?')
#questionParsed=rrp.simple_parse('Why does a zebra have stripes and a giraffe has square spots?')
print repr(nbest_list[0])
print nbest_list[0].ptb_parse #parse tree 
print nbest_list[0].parser_score #parser score 
print nbest_list[0].reranker_score # reranker score   
tokens = nbest_list[0].ptb_parse.sd_tokens()
for token in tokens:
     print token


for dirpath, dirs, files in os.walk(source_path):
    for file in files:
        fname = os.path.join(dirpath, file)
        print "fname=", fname
Example #15
from bllipparser import RerankingParser
RerankingParser.fetch_and_load('GENIA+PubMed')
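Once fetch_and_load has downloaded a model, later runs can skip the download and load it from the cached unified model directory. A sketch, assuming the default cache location hinted at by the commented-out path in Example #20 (the exact directory may differ per machine):

import os
from bllipparser import RerankingParser

model_dir = os.path.expanduser('~/.local/share/bllipparser/GENIA+PubMed')
rrp = RerankingParser.from_unified_model_dir(model_dir)
print(rrp.simple_parse('BRCA1 mutations were identified.'))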
Example #16
def main(transcript):

    # results = {"0": "1.0", "1": "0.9747",
    #            "2": "0.968", "3": "0.8859", "4": "0.7071"}
    # print(json.dumps(results))

    results = {}
    sentences = sent_tokenize(transcript)
    '''
        Declaration of constants and functions
    '''

    CONS_SATIRIC = 0
    CONS_RELIABLE = 1
    rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=False)
    foo = TripletExtraction()
    bar = SemanticSimilarityAnalysis()
    '''
        2 database tables for comparison of input 
    '''
    c.execute('SELECT title FROM reliable_news')
    reliable_news = [tup[0] for tup in c.fetchall()]

    c.execute('SELECT title FROM satirical_news')
    satirical_news = [tup[0] for tup in c.fetchall()]

    t = len(sentences)
    correct_classifications = 0
    for i in range(t):
        max_similarity = 0
        classification = -1
        max_sentence = ""

        inp = sentences[i]
        ''' 
            generates the tree and gets the SVO of the input sentence
        '''
        tree_inp = Tree(rrp.simple_parse(inp))
        svo_inp = foo.getSVO(tree_inp[0])
        '''
            comparison for satirical and reliable news
        '''
        for title in satirical_news:
            for subj in svo_inp['subject']:
                if subj[2] == 0:
                    continue
                words = [x.lower() for x in sentence_tokenizer.tokenize(title)]
                if subj[0] in words or singularize(subj[0]) in words:
                    tree_data = Tree(rrp.simple_parse(title))
                    svo_data = foo.getSVO(tree_data[0])

                    similarity_score1 = bar.get_similarities(svo_inp, svo_data)
                    '''
                        object and subject swapped to provide more possible comparisons
                    '''
                    svo_data['subject'], svo_data['object'] = svo_data[
                        'object'], svo_data['subject']
                    similarity_score2 = bar.get_similarities(svo_inp, svo_data)

                    similarity_score = similarity_score1 if similarity_score1 > similarity_score2 else similarity_score2
                    if similarity_score > max_similarity:
                        classification = 0
                        max_similarity = similarity_score
                        max_sentence = title
                    break

        for title in reliable_news:
            for sht in satiric_shits:
                title = title.replace(sht, "")
            for subj in svo_inp['subject']:
                if subj[2] == 0:
                    continue
                words = [x.lower() for x in sentence_tokenizer.tokenize(title)]
                if subj[0] in words or singularize(subj[0]) in words:
                    tree_data = Tree(rrp.simple_parse(title))
                    svo_data = foo.getSVO(tree_data[0])

                    similarity_score1 = bar.get_similarities(svo_inp, svo_data)
                    '''
                        object and subject swapped to provide more possible comparisons
                    '''
                    svo_data['subject'], svo_data['object'] = svo_data[
                        'object'], svo_data['subject']
                    similarity_score2 = bar.get_similarities(svo_inp, svo_data)

                    similarity_score = similarity_score1 if similarity_score1 > similarity_score2 else similarity_score2
                    if similarity_score > max_similarity:
                        classification = 1
                        max_similarity = similarity_score
                        max_sentence = title
                    break
        if classification == CONS_RELIABLE:
            results[str(i)] = str(round(max_similarity, 4))
        elif classification == CONS_SATIRIC:
            results[str(i)] = str(round(-max_similarity, 4))
        else:
            results[str(i)] = "0"
    print(json.dumps(results))
Example #17
 def __init__(self, model="WSJ-PTB3"):
     super().__init__(RerankingParser.fetch_and_load(model, verbose=True))
Example #18
    return []


import glob
import warnings

import gensim
import numpy as np
from bllipparser import RerankingParser

warnings.filterwarnings('ignore')

rel_summary_all_doc = np.load("/home/yld8809/all_rel/tp_all_train.npy")
raw_doc_folder = "/home/yld8809/all_rel/txt_all_train/"

rel_summary_all_doc_test = np.load("/home/yld8809/all_rel/tp_all_test.npy")
raw_doc_folder_test = "/home/yld8809/all_rel/txt_all_test/"

model = gensim.models.KeyedVectors.load_word2vec_format(
    "/home/yld8809/semrel/mimic3_pp300.txt", binary=False)
model_size = 300

rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)

raw_ind = glob.glob(raw_doc_folder + '/*.txt')
raw_ind.sort()

raw_ind_test = glob.glob(raw_doc_folder_test + '/*.txt')
raw_ind_test.sort()

num_doc = len(raw_ind)
num_doc_test = len(raw_ind_test)

word_embedding_all = np.empty(shape=[0, model_size + 7])
dep_mat_all = np.empty(shape=[0, 0])
de_parse_last = []
last_sent = []
Example #19
def set_assertions_for_yesno_questions(data):
    rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    yesno = data.get_questions_of_type('yesno')
    for q in tqdm(yesno):
        q.assertion_pos = q2s(q.question, rrp)
Example #20
class Preprocessor(object):
    """IMPORTANT: The list below stores multiple different forms of text, to minimize the amount of computation"""
    textList = {}
    _firstInitialization = True
    filename = ''
    #rrp = BllipParser.from_unified_model_dir('/home/vsocrates/.local/share/bllipparser/GENIA+PubMed')
    rrp = RerankingParser.fetch_and_load('GENIA+PubMed')

    def __init__(self, rawTextFileName, intermediateXMLFileName):
        """Initializes the Preprocessor and returns it. This includes loading any models that will be used in multiple preprocessing methods (e.g. RerankingParser)

        Args:
            rawTextFileName (str): The name of the raw string narrative file
            intermediateXMLFileName (str): The name of the BLANK file to contain the intermediate output XML

        Returns:
            Preprocessor object

        """
        if 'filename' in Preprocessor.textList and Preprocessor.textList[
                'filename'] == rawTextFileName:
            self.filename = Preprocessor.textList['filename']
            self.xmlname = intermediateXMLFileName

            return

        Preprocessor.textList = {}

        if rawTextFileName is not None:
            self.filename = rawTextFileName
            self.xmlname = intermediateXMLFileName
            Preprocessor.textList['filename'] = self.filename
            self.parseText()

        else:
            print "Need a text file!"
            return

    def getList(self):
        return Preprocessor.textList

    def parseText(self):
        """Creates the XML object and parses the raw narrative into the ElementTree python object. This method parses paragraphs, sentences,
        and tokenizes the text. Any additional features that need to be added into the XML file must have their own methods. 
           
        Args:
            None
        
        Returns:
            None
            It does write the parsed text to the file specified in the initializer

        """
        raw = self.rawText()
        rawOffsetIntermed = raw
        offsetIter = 0
        offsetIterSent = 0
        self.tree = ET.ElementTree(ET.Element('StartOutput'))
        self.root = self.tree.getroot()
        paraParent = ET.SubElement(self.root, 'Paragraphs')
        globalIDIndex = 0
        """Now we are breaking up by paragraph"""
        paraSplit = re.compile('\n').split(raw)
        # Originally, we were using RegEx to remove all the empty space elements in the list, but they are all '', so we are just going to compare directly for that. Use this again if you find that that is no longer the case.
        # paragraphPattern = re.compile('[^\s*]')
        # paragraphs = [i for i in paraSplit if not paragraphPattern.match(i)]
        paragraphs = [i for i in paraSplit if i != '']

        paraParent.set('Count', str(len(paragraphs)))

        for index, paragraph in enumerate(paragraphs):
            tempParaElement = ET.Element('Paragraph',
                                         attrib={'id': str(index)})

            # We aren't currently including the paragraph text in the <Paragraph /> tag
            # tempParaElement.text =  paragraph
            paraParent.append(tempParaElement)
            """Now we have to sentence tokenize the text"""
            paragraph = re.sub(
                '-', ' ', paragraph
            )  ## Replace "-" with " " in the sentences, especially useful for extracting age
            sentList = sent_tokenize(paragraph)
            sentParent = ET.Element('Sentences')
            sentParent.set('Count', str(len(sentList)))
            tempParaElement.append(sentParent)
            for index, sent in enumerate(sentList):
                offsetIndexSent = rawOffsetIntermed.find(sent, offsetIterSent)
                tempSentElement = ET.Element(
                    'Sentence',
                    attrib={
                        'id':
                        str(index),
                        'offset':
                        str(offsetIndexSent) + ':' +
                        str(offsetIndexSent + len(sent))
                    })
                sentTextElem = ET.Element('Text')
                sentTextElem.text = sent
                tempSentElement.append(sentTextElem)
                sentParent.append(tempSentElement)
                offsetIterSent = offsetIndexSent
                """Now we have to break it down by token"""
                tokensList = word_tokenize(sent)
                tokenParent = ET.Element('Tokens')
                tokenParent.set('Count', str(len(tokensList)))
                tempSentElement.append(tokenParent)
                for index, word in enumerate(tokensList):
                    offsetIndex = rawOffsetIntermed.find(word, offsetIter)
                    tempWordElement = ET.Element(
                        'Token',
                        attrib={
                            'id':
                            str(index),
                            'globalID':
                            str(globalIDIndex),
                            'offset':
                            str(offsetIndex) + ':' +
                            str(offsetIndex + len(word))
                        })
                    textElem = ET.Element('Text')
                    textElem.text = word
                    tempWordElement.append(textElem)
                    tokenParent.append(tempWordElement)
                    offsetIter = offsetIndex
                    globalIDIndex += 1

        self.writeToXML()

    def rawText(self):
        """Returns the raw string (usually only used for RegEx extractors that don't want any preprocessing/XML)

        Args:
            None
            
        Returns
            The raw string from the text file (str)
        """
        if Preprocessor.textList.get('rawText') is None:
            file = open(self.filename)
            raw = file.read()
            rawUnicode = raw.decode('utf-8')
            raw = self.unicodeToASCII(rawUnicode)
            Preprocessor.textList['rawText'] = raw
            file.close()
        return Preprocessor.textList.get('rawText')

    def timexTagText(self, altText=None):
        """Tags all the temporal expressions and surrounds them with <TIMEX2> XML tags in line with the text

        Args:
            altText (str) The text to be tagged, if it is not the same as the whole narrative the preprocessor was created with. This text won't be stored.
            
        Returns:
            tagged text (str)
        
        """
        """When altText is specified, the method assumes that some random text is being sent to be tagged, so doesn't save in dictionary"""
        if altText is not None:
            raw = altText
            altOutput = timex.tag(raw)
            return altOutput

        else:
            """Otherwise, we first check if it exists in the textList dict, if not, it is created and returned"""
            raw = self.rawText()
            if Preprocessor.textList.get('timexTagText') is None:
                Preprocessor.textList['timexTagText'] = timex.tag(raw)

        return Preprocessor.textList.get('timexTagText')

    def wordTokenizeText(self, altText=None):
        """Tokenizes all the words currently using the nltk TreebankTokenizer for words, and the Punkt sentence tokenizer.
        
        Args:
            altText (str) The text to be tagged, if it is not the same as the whole narrative the preprocessor was created with. This text won't be stored.

        Returns:
            tokenized text (nested list, by sentence): 
            ex. [['This', 'is', 'a', 'sentence', '.'],['And', 'maybe', 'another']]
        """
        if altText is not None:
            raw = altText
            altTokenizedText = [word_tokenize(t) for t in sent_tokenize(raw)]
            return altTokenizedText

        else:
            raw = self.rawText()
            if Preprocessor.textList.get('wordTokenizeText') is None:
                Preprocessor.textList['wordTokenizeText'] = [
                    word_tokenize(t) for t in sent_tokenize(raw)
                ]
            else:
                print "Didn't create one!!"

        return Preprocessor.textList.get('wordTokenizeText')

    def timexTagAndTokenizeText(self, altText=None):
        """Tags temporal expressions with nltk timex2, and tokenizes the resultant text.

        Args:
            altText (str) The text to be tagged, if it is not the same as the whole narrative the preprocessor was created with. This text won't be stored.
        
        Returns:
            tokenized text (nested list, by sentence): 
            ex. [['This', 'is', 'a', 'sentence', '.'],['And', 'maybe', 'another']]

        """
        """In this method, two steps are required, so if altText is specified, all steps are done inside the if statement, so incorrect dict entries aren't stored"""
        if altText is not None:
            raw = altText
            altOutputStep1 = self.timexTagText(raw)
            altOutputStep2 = self.wordTokenizeText(altOutputStep1)
            time_tagged_and_tokenizedText = MWETokenizer(
                mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')],
                separator='').tokenize(altOutputStep2)

            return time_tagged_and_tokenizedText
        else:
            """Tag all temporal expressions with timex2 tags."""
            """Don't need to open file here, because it's opened in timexTagText()"""
            tagged = self.timexTagText()
            """Word-tokenize all text above"""
            word_tagged = self.wordTokenizeText(tagged)
        '''consolidate all broken apart Timex2 tags into single "words"'''
        if Preprocessor.textList.get('timexTagAndTokenizeText') is None:
            nestedListOutput = [
                MWETokenizer(mwes=[('<', '/TIMEX2', '>'),
                                   ('<', 'TIMEX2', '>')],
                             separator='').tokenize(x) for x in word_tagged
            ]

            #We need to remove and change this line if we don't want flattened (one dimensional list). Read below comment.
            Preprocessor.textList['timexTagAndTokenizeText'] = [
                item for sublist in nestedListOutput for item in sublist
            ]
        """Currently, the output is a flattened list, we need to decide if we want to keep the sentence structure (making the output a list of lists.
        This throws off the AEExtractor and the SuspectExtractor, which need to then be fixed."""
        return Preprocessor.textList.get('timexTagAndTokenizeText')

    def posTaggedText(self, altText=None):
        """Tags the text with parts-of-speech (POS) using the Charniak-Johnson parser after nltk tokenizes the words using the Penn Treebank tokenizer. 

        Args:
            altText (str) The text to be tagged, if it is not the same as the whole narrative the preprocessor was created with. This text won't be stored.
        
        Returns:
            the POS-tagged text (nested list)
            ex. [[('A', 'DT'), ('female', 'JJ'), ('patient', 'NN'), ('died', 'VBD'), ('while', 'IN'), ('receiving', 'VBG'), ('Taxol', 'NN'), ('therapy', 'NN'), ('.', '.')], [('She', 'PRP'), ('did', 'VBD'), ("n't", 'RB'), ('surive', 'VB'), ('.', '.')]]
        
        """
        self.parseXML()

        if altText is not None:
            raw = altText
            altOutputStep1 = self.wordTokenizeText(raw)
            altOutputStep2 = [
                Preprocessor.rrp.tag(sent) for sent in altOutputStep1
            ]
            return altOutputStep2
        else:

            if Preprocessor.textList.get('posTaggedText') is None:
                posTaggedSents = []
                paragraphs = self.root.find('Paragraphs')
                for paragraph in paragraphs.findall('Paragraph'):
                    sentences = paragraph.find('Sentences')
                    for sentence in sentences.findall('Sentence'):
                        tokens = sentence.find('Tokens')
                        #We have to take the first element, because for some reason, wordTokenizeText outputs a nested list, even with only one sentence
                        words = self.wordTokenizeText(
                            sentence.find('Text').text)[0]
                        """We have to check if words is empty or not, otherwise segfault"""
                        if words:
                            posTagList = Preprocessor.rrp.tag(words)
                            posTaggedSents.append(posTagList)
                            for index, token in enumerate(
                                    tokens.findall('Token')):
                                token.attrib['POSTag'] = posTagList[index][1]

                Preprocessor.textList['posTaggedText'] = posTaggedSents
                self.writeToXML()
            else:
                return Preprocessor.textList.get('posTaggedText')

    def getParseTree(self, altText=None):
        """
        Creates a parse tree using the POS tags in the intermediate XML (the method above) and the Charniak-Johnson parser. 
        
        Args:
            altText (str) The text to be tagged, if it is not the same as the whole narrative the preprocessor was created with. This text won't be stored.
        
        Returns:
            The parse tree created (str)
        """

        self.parseXML()
        """In order to use the BLLIP parser (Charniak-Johnson parser) we must tokenize by sentence first. When using the alternate text option
        you have to only pass it individual sentences, like other methods (TODO: make sure this is the case for other methods)
        """
        if altText is not None:
            raw = altText
            altOutputStep1 = self.wordTokenizeText(raw)
            altParseTree = Preprocessor.rrp.simple_parse(altOutputStep1)
            return altParseTree
        else:
            # Since we are doing an I/O anyway to input the new XML tags, we don't have to retokenize, and can use the information from the base XML document
            # sent_tokens = sent_tokenize(raw)
            # output = [rrp.simple_parse(sent) for sent in sent_tokens]
            if Preprocessor.textList.get('getParseTree') is None:
                parsedTreeList = []
                paragraphs = self.root.find('Paragraphs')
                for paragraph in paragraphs.findall('Paragraph'):
                    sentences = paragraph.find('Sentences')
                    for sentence in sentences.findall('Sentence'):
                        tempParseTreeElement = ET.Element('ParseTree')
                        # We have to take the first element, because for some reason, wordTokenizeText outputs a nested list, even with only one element
                        text = sentence.find('Text').text
                        """Only going to create a parse tree if there is some alphanumeric character and a period, otherwise parser crashes"""
                        if re.search('\w+\.?', text):
                            tempParseTreeElement.text = Preprocessor.rrp.simple_parse(
                                self.wordTokenizeText(text)[0])
                            parsedTreeList.append(tempParseTreeElement.text)
                        else:
                            parsedTreeList.append([])
                            pass
                            """Currently, if the sentence doesn't have any alphanumeric characters (followed by a period), nothing will be entered in the text,
                            but a ParseTree object will still be created and added."""
                        sentence.append(tempParseTreeElement)

                Preprocessor.textList['getParseTree'] = parsedTreeList
                self.writeToXML()
            else:
                return Preprocessor.textList.get('getParseTree')

    def getMetaMapConcepts(self, altText=None):
        """
        Returns the MetaMap concepts found using the 'pymetamap' python wrapper. 
        
        Args:
            altText (str) The text to be tagged, if it is not the same as the whole narrative the preprocessor was created with. This text won't be stored.
        
        Returns:
            the MetaMap concepts, as described in the pymetamap documentation (list)
        """
        if Preprocessor.textList.get("getMetaMapConcepts") is None:
            self.parseXML()
            mm = MetaMap.get_instance('/work/tkakar/public_mm/bin/metamap14')
            rawText = self.rawText()

            concepts, error = mm.extract_concepts([rawText])
            pattern = re.compile(
                '(\[(?:(orch|phsu|sosy|dsyn),?(orch|phsu|sosy|dsyn)?)\])')
            globalIDByConcept = {}
            returnedList = []
            for concept in concepts:
                if not hasattr(concept, 'aa'):
                    #TODO, see if there is any information that we are missing due to some combination not described by the Regex
                    match = pattern.search(concept.semtypes)
                    if match:
                        returnedList.append(concept)
                        posInfo = concept.pos_info
                        triggerInfo = concept.trigger.split('-')
                        conceptName = triggerInfo[3]
                        #need to replace the quotes in the conceptName
                        conceptName = conceptName.replace('"', '')

                        if ';' in posInfo or '^' in posInfo:
                            posInfoList = self.offsetParse(posInfo, ';')
                        else:
                            posInfoList = self.offsetParse(posInfo)
                            #We need to change the format of the posInfos from (offset,span) to (offsetStartIndex, offsetEndIndex) here:
                        posInfoList = [(offset, span + offset)
                                       for (offset, span) in posInfoList]

                        for listIndex, (startIndex,
                                        endIndex) in enumerate(posInfoList):
                            lfNum = rawText.count('\n', 0, startIndex)
                            lastIdx = rawText.rfind(
                                conceptName, 0, startIndex + len(conceptName))
                            #you're going to forget this tomorrow morning, so this is the number of line feeds between the last instance of the concept name and where metamap thinks the word is.
                            lfNumSpecific = rawText.count(
                                '\n', lastIdx, startIndex)
                            #For some reason, we need to subtract one at the end, TODO: Figure out why
                            posInfoList[listIndex] = (startIndex -
                                                      (lfNum + 1) +
                                                      lfNumSpecific - 1,
                                                      endIndex - (lfNum + 1) +
                                                      lfNumSpecific - 1)

                        globalIDList = []
                        #we have the fixed offsets for each mention of the semantic type. we now need to find their location in the xml file.
                        for newStartIdx, newEndIdx in posInfoList:
                            #                        print "newStartIdx: ", newStartIdx
                            #                        print "newEndIdx: ", newEndIdx
                            globalIds = self.placeOffsetInXML(
                                conceptName, word_tokenize(conceptName),
                                newStartIdx, newEndIdx - newStartIdx)
                            globalIDList.append(globalIds)

                        globalIDByConcept[concept] = globalIDList

            for key, value in globalIDByConcept.iteritems():
                for gIDList in value:
                    for gID in gIDList:
                        conceptXMLTag = self.root.find(".//*[@globalID='" +
                                                       str(gID) + "']")
                        tempMetaMapElem = ET.Element("METAMAP")
                        tempMetaMapElem.text = key.semtypes.replace("'", '')
                        conceptXMLTag.append(tempMetaMapElem)

            Preprocessor.textList['getMetaMapConcepts'] = returnedList
            self.writeToXML()

        return Preprocessor.textList.get('getMetaMapConcepts')

    def writeToXML(self):
        """Writes the tree to the output xml specified.

        Args:
            None

        Returns:
            None
        """
        self.tree.write(self.xmlname)  #, encoding='utf-8')

    def parseXML(self):
        """Parses the XML tree in the xml file specified. This method was created to minimize file I/Os.
        
        Args:
            None

        Returns:
            None
        """
        self.tree = ET.parse(
            self.xmlname)  #, parser=XMLParser(encoding='utf-8'))
        self.root = self.tree.getroot()

    def getRoot(self):
        self.parseXML()
        return self.root

    def placeOffsetInXML(self, phrase, tokenizedText, offset, span):
        """Takes a word/phrase and finds the globalIDs of the tokens in the intermediate XML that this word/phrase corresponds to. 
    
        Args:
            phrase (str) The string to be placed in XML
            tokenizedText (list) The tokenized text is used to ensure that the same tokenizer used on the rest of the document is kept consistent. 
            offset (int) The offset, in relation to the original text file
            span (int) The length of the string (currently unused)
        Returns:
            List of globalIDs (for tokens) that match the phrase (list) 
        """
        self.parseXML()
        tokenLength = len(tokenizedText)
        tokens = self.root.findall(".//Token")
        idsReturned = 0
        globalIDList = []
        foundOffsetFlag = False
        for token in tokens:
            if idsReturned >= tokenLength:
                break
            #In this case, we only ever get one offset at a time, so we don't loop through them. Just take the first (and only) element.
            (tokenStart,
             tokenEnd) = self.offsetParse(token.attrib['offset'])[0]
            if (offset == tokenStart or foundOffsetFlag):
                foundOffsetFlag = True
                globalIDList.append(int(token.attrib['globalID']))
                idsReturned += 1

        return globalIDList

    def offsetParse(self, offsetStr, delimiter=None):
        """Finds the offset and returns a tuple of starting and ending indices based on XML Format (0:34). Support multiple offsets, with delimiter specified. Returns in list format, even with only one element to keep consistency"""
        offsetIntList = []
        if delimiter is not None:
            """For some reason, the case where offsetParse() is used in the MetaMap preprocessing, sometimes the delimiter (that is normally a colon[:]) is replaced (randomly, it seems) or by a carrot (^)
            The regex below is support for that. """
            offsetList = re.split(
                delimiter.encode('string-escape') + r'|\^', offsetStr)
            for offset in offsetList:
                if ':' in offset:
                    colonLoc = offset.find(':')
                    offsetlist = [
                        int(offset[0:colonLoc]),
                        int(offset[colonLoc + 1:len(offset)])
                    ]
                    offsetIntList.append(offsetlist)
            return offsetIntList
        else:
            colonLoc = offsetStr.find(':')
            return [(int(offsetStr[0:colonLoc]),
                     int(offsetStr[colonLoc + 1:len(offsetStr)]))]

    def unicodeToASCII(self, string):
        """We are going to work solely in ascii, as it's easier for certain methods (i.e. word tokenization)"""
        string = string.replace(u"\u2019", r"'")
        string = string.replace(u"\u201C", r'"')
        string = string.replace(u"\u201D", r'"')
        string = string.replace(u"\u2013", r'-')
        #degrees
        string = string.replace(u"\u00B0", r'^')

        return string
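A hypothetical driver for the Preprocessor class above; the file names are made up, and 'narrative.xml' is the blank intermediate XML file the class writes into:

pre = Preprocessor('narrative.txt', 'narrative.xml')  # tokenizes the narrative and writes the base XML

pre.posTaggedText()              # adds a POSTag attribute to every <Token> element
pre.getParseTree()               # adds a <ParseTree> element to every <Sentence>
tokens = pre.wordTokenizeText()  # nested list of word tokens per sentence, cached in textList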
Example #21
 def __init__(self):
     self.rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
     self.sd = StanfordDependencies.get_instance(backend='subprocess')