def test_minimal_communication_with_uuid(self):
    '''A Communication with id, metadata, type and uuid set must validate.'''
    # Build the metadata first; it only depends on the current time.
    metadata = AnnotationMetadata(tool="TEST", timestamp=int(time.time()))

    communication = Communication()
    communication.id = "myID"
    communication.metadata = metadata
    communication.type = "Test Communication"
    communication.uuid = generate_UUID()

    is_valid = validate_communication(communication)
    self.assertTrue(is_valid)
Beispiel #2
0
 def test_minimal_communication_with_uuid(self):
     '''Check that a minimally populated Communication passes validation.

     The Communication gets an id, tool/timestamp metadata, a type and a
     generated UUID before being handed to validate_communication().
     '''
     now = int(time.time())
     minimal = Communication()
     minimal.id = "myID"
     minimal.metadata = AnnotationMetadata(tool="TEST", timestamp=now)
     minimal.type = "Test Communication"
     minimal.uuid = generate_UUID()
     self.assertTrue(validate_communication(minimal))
Beispiel #3
0
def add_chunks_to_comm(comm, chunklink, fail_on_error):
    '''Converts the first constituency tree of each tokenization
    to chunks and adds them as a TokenTagging to the communication.

    The parse is rendered as a PTB-style string, piped to the chunklink
    perl script on stdin, and the script's stdout is passed to
    get_chunks() to obtain one tag per token. Tokenizations without a
    parse are counted but left untouched.

    comm - Communication to be annotated.
    chunklink - Path to the modified chunklink perl script.
    fail_on_error - If true, every error is logged and then re-raised to
        the caller. If false, an error on one tokenization is logged and
        processing moves on to the next one; an error while iterating the
        communication is logged and the counts gathered so far are returned.

    Returns a (num_chunked, num_sents) tuple: the number of tokenizations
    that received a "CHUNK" TokenTagging and the number of tokenizations
    visited.
    '''
    num_sents = 0
    num_chunked = 0
    try:
        for tokenization in get_tokenizations(comm):
            num_sents += 1
            try:
                # A missing or empty parse list is falsy: nothing to convert.
                if not tokenization.parseList:
                    continue
                parse = tokenization.parseList[0]
                # Convert concrete Parse to a PTB style parse string to use
                # as stdin for chunklink.
                ptb_str = '( ' + penn_treebank_for_parse(parse) + ' )\n'
                ptb_str = ptb_str.encode('ascii', 'replace')
                logging.debug("PTB string: %s", ptb_str)

                # Run the chunklink script and capture the output.
                try:
                    # We expect the chunklink script to be a modified version
                    # which can read a tree from stdin.
                    command = ['perl', chunklink]
                    p = subprocess.Popen(command,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE,
                                         stdin=subprocess.PIPE)
                    chunk_str, chunk_err = p.communicate(input=ptb_str)
                    logging.debug("Chunklink stdout:\n%s", chunk_str)
                    logging.debug("Chunklink stderr:\n%s", chunk_err)
                    # Popen.communicate() never raises CalledProcessError by
                    # itself, so a failing script has to be detected via its
                    # exit status for the handler below to be reachable.
                    if p.returncode != 0:
                        raise subprocess.CalledProcessError(p.returncode,
                                                            command)
                except subprocess.CalledProcessError:
                    logging.error("Chunklink failed on tree: %s", ptb_str)
                    if fail_on_error:
                        # Bare raise keeps the original traceback.
                        raise
                    continue

                chunk_tags = get_chunks(chunk_str)
                logging.debug("Chunk tags: %s", chunk_tags)
                num_tokens = len(tokenization.tokenList.tokenList)
                if len(chunk_tags) != num_tokens:
                    raise Exception(
                        "ERROR: incorrect number of chunks. expected=%d actual=%d"
                        % (num_tokens, len(chunk_tags)))

                metadata = concrete.AnnotationMetadata()
                metadata.tool = "Chunklink Constituency Converter"
                # int() rather than the Python 2-only long(); matches how the
                # tests build their AnnotationMetadata timestamps.
                metadata.timestamp = int(time.time())
                # Extract the chunks column and create a TokenTagging from it.
                chunks = concrete.TokenTagging()
                chunks.uuid = concrete_uuid.generate_UUID()
                chunks.metadata = metadata
                chunks.taggingType = "CHUNK"
                chunks.taggedTokenList = []
                for i, chunk in enumerate(chunk_tags):
                    tt = concrete.TaggedToken()
                    tt.tokenIndex = i
                    tt.tag = chunk
                    chunks.taggedTokenList.append(tt)

                # Add chunks to the list of TokenTaggings.
                if not tokenization.tokenTaggingList:
                    tokenization.tokenTaggingList = []
                tokenization.tokenTaggingList.append(chunks)
                num_chunked += 1
            except Exception:
                logging.exception("Chunking failed on tokenization")
                if fail_on_error:
                    raise
    except Exception:
        logging.exception("Chunking failed on Communication")
        if fail_on_error:
            raise
    return num_chunked, num_sents
def add_chunks_to_comm(comm, chunklink, fail_on_error):
    '''Converts the first constituency tree of each tokenization
    to chunks and adds them as a TokenTagging to the communication.

    The parse is rendered as a PTB-style string, piped to the chunklink
    perl script on stdin, and the script's stdout is passed to
    get_chunks() to obtain one tag per token. Tokenizations without a
    parse are counted but left untouched.

    comm - Communication to be annotated.
    chunklink - Path to the modified chunklink perl script.
    fail_on_error - If true, every error is logged and then re-raised to
        the caller. If false, an error on one tokenization is logged and
        processing moves on to the next one; an error while iterating the
        communication is logged and the counts gathered so far are returned.

    Returns a (num_chunked, num_sents) tuple: the number of tokenizations
    that received a "CHUNK" TokenTagging and the number of tokenizations
    visited.
    '''
    num_sents = 0
    num_chunked = 0
    try:
        for tokenization in get_tokenizations(comm):
            num_sents += 1
            try:
                # A missing or empty parse list is falsy: nothing to convert.
                if not tokenization.parseList:
                    continue
                parse = tokenization.parseList[0]
                # Convert concrete Parse to a PTB style parse string to use
                # as stdin for chunklink.
                ptb_str = '( ' + penn_treebank_for_parse(parse) + ' )\n'
                ptb_str = ptb_str.encode('ascii', 'replace')
                logging.debug("PTB string: %s", ptb_str)

                # Run the chunklink script and capture the output.
                try:
                    # We expect the chunklink script to be a modified version
                    # which can read a tree from stdin.
                    command = ['perl', chunklink]
                    p = subprocess.Popen(command,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE,
                                         stdin=subprocess.PIPE)
                    chunk_str, chunk_err = p.communicate(input=ptb_str)
                    logging.debug("Chunklink stdout:\n%s", chunk_str)
                    logging.debug("Chunklink stderr:\n%s", chunk_err)
                    # Popen.communicate() never raises CalledProcessError by
                    # itself, so a failing script has to be detected via its
                    # exit status for the handler below to be reachable.
                    if p.returncode != 0:
                        raise subprocess.CalledProcessError(p.returncode,
                                                            command)
                except subprocess.CalledProcessError:
                    logging.error("Chunklink failed on tree: %s", ptb_str)
                    if fail_on_error:
                        # Bare raise keeps the original traceback.
                        raise
                    continue

                chunk_tags = get_chunks(chunk_str)
                logging.debug("Chunk tags: %s", chunk_tags)
                num_tokens = len(tokenization.tokenList.tokenList)
                if len(chunk_tags) != num_tokens:
                    raise Exception(
                        "ERROR: incorrect number of chunks. expected=%d actual=%d"
                        % (num_tokens, len(chunk_tags)))

                metadata = concrete.AnnotationMetadata()
                metadata.tool = "Chunklink Constituency Converter"
                # int() rather than the Python 2-only long(); matches how the
                # tests build their AnnotationMetadata timestamps.
                metadata.timestamp = int(time.time())
                # Extract the chunks column and create a TokenTagging from it.
                chunks = concrete.TokenTagging()
                chunks.uuid = concrete_uuid.generate_UUID()
                chunks.metadata = metadata
                chunks.taggingType = "CHUNK"
                chunks.taggedTokenList = []
                for i, chunk in enumerate(chunk_tags):
                    tt = concrete.TaggedToken()
                    tt.tokenIndex = i
                    tt.tag = chunk
                    chunks.taggedTokenList.append(tt)

                # Add chunks to the list of TokenTaggings.
                if not tokenization.tokenTaggingList:
                    tokenization.tokenTaggingList = []
                tokenization.tokenTaggingList.append(chunks)
                num_chunked += 1
            except Exception:
                logging.exception("Chunking failed on tokenization")
                if fail_on_error:
                    raise
    except Exception:
        logging.exception("Chunking failed on Communication")
        if fail_on_error:
            raise
    return num_chunked, num_sents
 def test_next(self):
     '''Create a Communication and assign it a newly generated UUID.

     NOTE(review): the visible body only performs setup and makes no
     assertions; this excerpt may be truncated -- confirm against the
     full test before relying on it.
     '''
     comm = Communication()
     comm.uuid = generate_UUID()
Beispiel #6
0
 def test_next(self):
     comm = Communication()
     comm.uuid = generate_UUID()