Example #1
0
 def _next_from_stream(self):
     """Read the next Communication from `self.protocol`.

     Returns a `(Communication, filename)` tuple whose second element
     is `self._source_filename`.  If the read raises EOFError,
     `self.transport` is closed and StopIteration is raised instead.
     """
     communication = Communication()
     try:
         communication.read(self.protocol)
         if self._add_references:
             add_references_to_communication(communication)
     except EOFError:
         # Nothing left to read: release the transport before
         # signalling that iteration is finished.
         self.transport.close()
         raise StopIteration
     return (communication, self._source_filename)
Example #2
0
 def _next_from_zip(self):
     """Deserialize the next zip member into a Communication.

     Advances `self.zip_infolist_index` by one on every successful call.

     Returns:
         A `(Communication, filename)` tuple, where `filename` is the
         `filename` attribute of the member's entry in
         `self.zip_infolist`.

     Raises:
         StopIteration: if `self.zip_infolist_index` has reached the
             end of `self.zip_infolist`.
     """
     if self.zip_infolist_index >= len(self.zip_infolist):
         raise StopIteration
     zipinfo = self.zip_infolist[self.zip_infolist_index]
     self.zip_infolist_index += 1
     # Close the member's file handle as soon as its bytes have been
     # read instead of leaving cleanup to the garbage collector.
     with self.zip.open(zipinfo) as member:
         serialized = member.read()
     comm = TSerialization.deserialize(
         Communication(),
         serialized,
         protocol_factory=factory.protocolFactory)
     if self._add_references:
         add_references_to_communication(comm)
     return (comm, zipinfo.filename)
Example #3
0
def read_communication_from_buffer(buf, add_references=True):
    '''
    Build a Communication from the serialized bytes in buf.

    buf is wrapped in a TMemoryBuffer and read through a protocol
    obtained from factory.createProtocol.  When add_references is
    true, add_references_to_communication is applied to the result
    before it is returned.
    '''
    protocol = factory.createProtocol(TMemoryBuffer(buf))
    communication = Communication()
    communication.read(protocol)
    if not add_references:
        return communication
    add_references_to_communication(communication)
    return communication
Example #4
0
def read_communication_from_buffer(buf, add_references=True):
    '''
    Return the Communication serialized in buf (a binary string).

    The bytes are exposed through a TMemoryBuffer transport and a
    protocol created by factory.createProtocol.  Unless
    add_references is false, add_references_to_communication is
    called on the Communication before returning it.
    '''
    memory_transport = TMemoryBuffer(buf)
    reader = factory.createProtocol(memory_transport)

    result = Communication()
    result.read(reader)

    if add_references:
        add_references_to_communication(result)

    return result
Example #5
0
def _comm_with_properties(num_properties):
    """Build a fixture Communication carrying `num_properties` properties.

    The Communication has one section with one single-token sentence,
    one EntityMention, and one SituationMention whose only
    MentionArgument holds `num_properties` Property objects named
    "Property0", "Property1", ...  All properties share one
    AnnotationMetadata.  `add_references_to_communication` is applied
    before the Communication is returned.
    """
    timestamp = 17

    # Section -> sentence -> tokenization with a single token.
    token = Token(tokenIndex=0,
                  text='text',
                  textSpan=TextSpan(start=0, ending=1))
    tokenization = Tokenization(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='tokn-tool', timestamp=timestamp),
        kind=TokenizationKind.TOKEN_LIST,
        tokenList=TokenList(tokenList=[token]))
    only_sentence = Sentence(uuid=generate_UUID(), tokenization=tokenization)
    only_section = Section(uuid=generate_UUID(),
                           kind='kind',
                           label='label',
                           sentenceList=[only_sentence])

    # One token reference sequence, shared by both mentions below.
    token_refs = TokenRefSequence(tokenizationId=tokenization.uuid,
                                  tokenIndexList=[0],
                                  anchorTokenIndex=0)

    entity_mention = EntityMention(uuid=generate_UUID(),
                                   entityType='entityType',
                                   text='text',
                                   tokens=token_refs)
    entity_mention_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='ems-tool', timestamp=timestamp),
        mentionList=[entity_mention])

    # Every Property points at the same metadata object.
    property_metadata = AnnotationMetadata(tool='Annotator1',
                                           timestamp=timestamp)
    properties = [
        Property(value="Property%d" % index,
                 metadata=property_metadata,
                 polarity=4.0)
        for index in range(num_properties)
    ]
    argument = MentionArgument(role='role',
                               entityMentionId=entity_mention.uuid,
                               propertyList=properties)
    situation_mention = SituationMention(uuid=generate_UUID(),
                                         tokens=token_refs,
                                         argumentList=[argument])
    situation_mention_set = SituationMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='sms-tool', timestamp=timestamp),
        mentionList=[situation_mention])

    communication = Communication(
        uuid=generate_UUID(),
        id='id',
        text='text',
        type='type',
        metadata=AnnotationMetadata(tool='tool', timestamp=timestamp),
        sectionList=[only_section],
        situationMentionSetList=[situation_mention_set],
        entityMentionSetList=[entity_mention_set])
    add_references_to_communication(communication)
    return communication
Example #6
0
def read_communication_from_file(communication_filename, add_references=True):
    """Load a Communication from the named file

    Args:

    - `communication_filename`: String with filename
    - `add_references`: If true, `add_references_to_communication` is
      called on the Communication before it is returned

    Returns:

    - A Concrete `Communication` object
    """
    communication = read_thrift_from_file(Communication(),
                                          communication_filename)
    if not add_references:
        return communication
    add_references_to_communication(communication)
    return communication
Example #7
0
def read_communication_from_file(communication_filename, add_references=True):
    """Read the file at `communication_filename` into a Communication

    Args:

    - `communication_filename`: String with filename
    - `add_references`: When true, the result is passed through
      `add_references_to_communication` before being returned

    Returns:

    - A Concrete `Communication` object
    """
    blank = Communication()
    loaded = read_thrift_from_file(blank, communication_filename)

    if add_references:
        add_references_to_communication(loaded)

    return loaded
Example #8
0
def _comm_with_properties(num_properties):
    """Return a minimal Communication with `num_properties` properties.

    The properties ("Property0", "Property1", ...) are attached to the
    single MentionArgument of the single SituationMention and share
    one AnnotationMetadata.  The Communication also contains one
    section (one sentence, one token) and one EntityMention, and has
    had `add_references_to_communication` applied to it.
    """
    ts = 17

    # --- tokenization / sentence / section ---
    tokenization_meta = AnnotationMetadata(tool='tokn-tool', timestamp=ts)
    span = TextSpan(start=0, ending=1)
    token_list = TokenList(
        tokenList=[Token(tokenIndex=0, text='text', textSpan=span)])
    tokenization = Tokenization(
        uuid=generate_UUID(),
        metadata=tokenization_meta,
        kind=TokenizationKind.TOKEN_LIST,
        tokenList=token_list,
    )
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokenization)
    section = Section(
        uuid=generate_UUID(),
        kind='kind',
        label='label',
        sentenceList=[sentence],
    )

    # --- entity mention (token refs are reused by the situation mention) ---
    refs = TokenRefSequence(
        tokenizationId=tokenization.uuid,
        tokenIndexList=[0],
        anchorTokenIndex=0,
    )
    mention = EntityMention(
        uuid=generate_UUID(),
        entityType='entityType',
        text='text',
        tokens=refs,
    )
    entity_set_meta = AnnotationMetadata(tool='ems-tool', timestamp=ts)
    entity_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=entity_set_meta,
        mentionList=[mention],
    )

    # --- situation mention with the requested number of properties ---
    shared_prop_meta = AnnotationMetadata(tool='Annotator1', timestamp=ts)
    prop_list = []
    for n in range(num_properties):
        prop_list.append(
            Property(value="Property%d" % n,
                     metadata=shared_prop_meta,
                     polarity=4.0))
    arg = MentionArgument(
        role='role',
        entityMentionId=mention.uuid,
        propertyList=prop_list,
    )
    situation = SituationMention(
        uuid=generate_UUID(),
        tokens=refs,
        argumentList=[arg],
    )
    situation_set_meta = AnnotationMetadata(tool='sms-tool', timestamp=ts)
    situation_set = SituationMentionSet(
        uuid=generate_UUID(),
        metadata=situation_set_meta,
        mentionList=[situation],
    )

    # --- assemble ---
    comm_meta = AnnotationMetadata(tool='tool', timestamp=ts)
    result = Communication(
        uuid=generate_UUID(),
        id='id',
        text='text',
        type='type',
        metadata=comm_meta,
        sectionList=[section],
        situationMentionSetList=[situation_set],
        entityMentionSetList=[entity_set],
    )
    add_references_to_communication(result)
    return result
Example #9
0
 def _next_from_tar(self):
     """Deserialize the next regular file in the tar into a Communication.

     Entries that are not regular files, and files whose basename
     starts with '._', are skipped.

     Returns:
         A `(Communication, name)` tuple, where `name` is the entry's
         `tarinfo.name`.

     Raises:
         StopIteration: when `self.tar.next()` returns None.
     """
     while True:
         tarinfo = self.tar.next()
         if tarinfo is None:
             raise StopIteration
         if not tarinfo.isfile():
             # Ignore directories
             continue
         filename = os.path.split(tarinfo.name)[-1]
         # Compare by value, not identity: `is` against a string
         # literal only works by accident of interning, and indexing
         # would fail on an empty basename.
         if filename.startswith('._'):
             # Ignore attribute files created by OS X tar
             continue
         comm = TSerialization.deserialize(
             Communication(),
             self.tar.extractfile(tarinfo).read(),
             protocol_factory=factory.protocolFactory)
         if self._add_references:
             add_references_to_communication(comm)
         # hack to keep memory usage O(1)
         # (...but the real hack is tarfile :)
         self.tar.members = []
         return (comm, tarinfo.name)