Exemple #1
0
def test_validate_minimal_communication_with_uuid():
    """A Communication carrying only id, metadata, type and uuid validates."""
    comm = Communication()
    comm.uuid = generate_UUID()
    comm.id = "myID"
    comm.type = "Test Communication"
    comm.metadata = AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    assert validate_communication(comm)
 def _next_from_stream(self):
     """Read the next Communication from the open Thrift stream.

     Returns:

     - A ``(Communication, source_filename)`` tuple.

     Closes the transport and raises ``StopIteration`` when the
     stream is exhausted.
     """
     try:
         comm = Communication()
         comm.read(self.protocol)
         if self._add_references:
             # References are not part of the serialized form; add them
             # in place when the reader was configured to do so.
             add_references_to_communication(comm)
         return (comm, self._source_filename)
     except EOFError:
         # Thrift signals end-of-stream with EOFError.
         self.transport.close()
         raise StopIteration
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:

    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
       The string will be whitespace-tokenized.

    Returns:

    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    tool = "TEST"
    now = int(time.time())

    uuid_gen = AnalyticUUIDGeneratorFactory().create()

    comm = Communication(
        id=comm_id,
        metadata=AnnotationMetadata(tool=tool, timestamp=now),
        type=tool,
        uuid=uuid_gen.next())

    # Whitespace-tokenize the sentence text up front.
    tokens = [Token(text=word, tokenIndex=index)
              for index, word in enumerate(sentence_string.split())]
    tokenization = Tokenization(
        kind=TokenizationKind.TOKEN_LIST,
        metadata=AnnotationMetadata(tool=tool, timestamp=now),
        tokenList=TokenList(tokenList=tokens),
        uuid=uuid_gen.next())

    sentence = Sentence(
        textSpan=TextSpan(0, len(sentence_string)),
        tokenization=tokenization,
        uuid=uuid_gen.next())

    section = Section(
        kind="SectionKind",
        sentenceList=[sentence],
        textSpan=TextSpan(0, len(sentence_string)),
        uuid=uuid_gen.next())

    comm.sectionList = [section]
    comm.text = sentence_string

    return comm
Exemple #4
0
def read_communication_from_buffer(buf, add_references=True):
    '''
    Deserialize a binary string into a Communication.

    - `buf`: serialized Communication bytes
    - `add_references`: when True, add references to the
      deserialized Communication in place before returning it.
    '''
    transport = TMemoryBuffer(buf)
    protocol = factory.createProtocol(transport)
    communication = Communication()
    communication.read(protocol)
    if add_references:
        add_references_to_communication(communication)
    return communication
Exemple #5
0
def read_communication_from_buffer(buf, add_references=True):
    '''
    Deserialize buf (a binary string) and return resulting
    communication.  Add references if requested.
    '''
    # Wrap the byte string in an in-memory Thrift transport and decode it
    # with the module-level protocol factory.
    transport_in = TMemoryBuffer(buf)
    protocol_in = factory.createProtocol(transport_in)
    comm = Communication()
    comm.read(protocol_in)
    if add_references:
        # References are not part of the serialized form; add them in place.
        add_references_to_communication(comm)
    return comm
Exemple #6
0
def main():
    """Convert a Concrete Communication file between Thrift encodings.

    The input/output protocols come either from ``--direction`` (a known
    conversion pair) or from explicit ``--iprotocol``/``--oprotocol``
    flags; gzip compression of either file is detected from its
    mimetype-guessed encoding.
    """
    set_stdout_encoding()

    parser = make_parser()
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    mimetypes.init()
    (ifile_type, ifile_encoding) = mimetypes.guess_type(args.input_file)
    (ofile_type, ofile_encoding) = mimetypes.guess_type(args.output_file)
    if args.direction is None:
        if args.iprotocol is None or args.oprotocol is None:
            print("Either --direction, or both --iprotocol and --oprotocol,"
                  " must be provided")
            exit(1)
    else:
        if (args.iprotocol is not None) or (args.oprotocol is not None):
            print("Not both --direction, and either --iprotocol or"
                  " --oprotocol, can be provided")
            exit(1)
    encoding_input = KNOWN_CONVERSIONS[
        args.direction][0] if args.iprotocol is None else PROTOCOLS[
            args.iprotocol]
    encoding_output = KNOWN_CONVERSIONS[
        args.direction][1] if args.oprotocol is None else PROTOCOLS[
            args.oprotocol]
    if ifile_encoding == 'gzip':
        # BUG FIX: the output handle is only needed on this path, and the
        # plain file must be opened in *binary* mode -- serialized Thrift
        # is bytes, and writing bytes to a text-mode handle raises
        # TypeError on Python 3.  (The original also opened/closed the
        # handle on the convert() path, which for gzip output appended a
        # spurious empty-gzip trailer after convert() had written.)
        if ofile_encoding == "gzip":
            out_writer = gzip.GzipFile(args.output_file, 'wb')
        else:
            out_writer = open(args.output_file, 'wb')
        f = gzip.GzipFile(args.input_file)
        try:
            transportIn = TTransport.TFileObjectTransport(f)
            protocolIn = encoding_input().getProtocol(transportIn)
            while True:
                try:
                    comm = Communication()
                    comm.read(protocolIn)
                    output_bytes = TSerialization.serialize(
                        comm, protocol_factory=encoding_output())
                    out_writer.write(output_bytes)
                except EOFError:
                    # Thrift signals end-of-stream with EOFError.
                    break
        finally:
            f.close()
            out_writer.close()
    else:
        # convert() manages its own file handles.
        convert(input_file_path=args.input_file,
                output_file_path=args.output_file,
                input_protocol_factory=encoding_input,
                output_protocol_factory=encoding_output)
def main():
    """Convert a Concrete Communication file between Thrift encodings.

    Protocols come either from --direction (a known conversion pair) or
    from explicit --iprotocol/--oprotocol flags; gzip compression is
    detected from the mimetype-guessed file encodings.
    """
    set_stdout_encoding()

    parser = make_parser()
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    mimetypes.init()
    (ifile_type, ifile_encoding) = mimetypes.guess_type(args.input_file)
    (ofile_type, ofile_encoding) = mimetypes.guess_type(args.output_file)
    out_writer = None
    if args.direction is None:
        if args.iprotocol is None or args.oprotocol is None:
            print("Either --direction, or both --iprotocol and --oprotocol,"
                  " must be provided")
            exit(1)
    else:
        if (args.iprotocol is not None) or (args.oprotocol is not None):
            print("Not both --direction, and either --iprotocol or"
                  " --oprotocol, can be provided")
            exit(1)
    encoding_input = KNOWN_CONVERSIONS[args.direction][
        0] if args.iprotocol is None else PROTOCOLS[args.iprotocol]
    encoding_output = KNOWN_CONVERSIONS[args.direction][
        1] if args.oprotocol is None else PROTOCOLS[args.oprotocol]
    if ofile_encoding == "gzip":
        out_writer = gzip.GzipFile(args.output_file, 'wb')
    else:
        # NOTE(review): text mode 'w' but serialized Thrift bytes are
        # written below -- raises TypeError on Python 3; should be 'wb'.
        out_writer = open(args.output_file, 'w')
    if ifile_encoding == 'gzip':
        f = gzip.GzipFile(args.input_file)
        transportIn = TTransport.TFileObjectTransport(f)
        protocolIn = encoding_input().getProtocol(transportIn)
        while True:
            try:
                comm = Communication()
                comm.read(protocolIn)
                output_bytes = TSerialization.serialize(
                    comm, protocol_factory=encoding_output())
                out_writer.write(output_bytes)
            except EOFError:
                # Thrift signals end-of-stream with EOFError.
                break
        f.close()
    else:
        # NOTE(review): out_writer is unused on this path; convert()
        # writes the output itself, yet out_writer still truncates the
        # file at open and (for gzip) appends a trailer at close.
        convert(input_file_path=args.input_file,
                output_file_path=args.output_file,
                input_protocol_factory=encoding_input,
                output_protocol_factory=encoding_output)
    out_writer.close()
Exemple #8
0
def createComm(cid, ctype, txt):
    '''Creates concrete communication file from text'''
    comm = Communication()
    comm.id = cid
    comm.uuid = concrete.util.generate_UUID()
    comm.type = ctype
    # Normalize stray non-breaking-space bytes, then collapse whitespace
    # around newlines so section boundaries are clean.
    txt = re.sub('[\xa0\xc2]', ' ', txt)
    txt = re.sub(r'\s*\n\s*', '\n', txt)
    if not txt.strip():
        # Nothing left after normalization: no Communication to build.
        return None
    comm.text = txt
    comm.metadata = create_dummy_annotation()

    # Section boundaries: the first newline of every run of newlines.
    breaks = [pos for pos, ch in enumerate(txt)
              if ch == '\n' and pos > 0 and txt[pos - 1] != '\n']
    if not breaks or breaks[-1] != len(txt) - 1:
        breaks.append(len(txt))

    comm.sectionList = []
    start = 0
    for end in breaks:
        sec = concrete.Section()
        sec.uuid = concrete.util.generate_UUID()
        sec.kind = "Passage"
        sec.textSpan = concrete.TextSpan(start, end)
        comm.sectionList.append(sec)
        start = end

    if not concrete.validate.validate_communication(comm):
        return None
    return comm
Exemple #9
0
 def commFromData(data):
     '''Returns Communication object generated from byte string'''
     # NOTE(review): no `self`/`cls` parameter -- presumably declared as a
     # staticmethod on the enclosing class; confirm against the class body.
     comm = Communication()
     # Decode the bytes in place using the Thrift compact protocol.
     TSerialization.deserialize(
         comm,
         data,
         protocol_factory=TCompactProtocol.TCompactProtocolFactory())
     return comm
 def _next_from_zip(self):
     """Read the next Communication entry from the open zip archive.

     Returns a ``(Communication, entry_filename)`` tuple; raises
     ``StopIteration`` once every entry in the archive's info list has
     been consumed.
     """
     if self.zip_infolist_index >= len(self.zip_infolist):
         raise StopIteration
     zipinfo = self.zip_infolist[self.zip_infolist_index]
     self.zip_infolist_index += 1
     comm = TSerialization.deserialize(
         Communication(),
         self.zip.open(zipinfo).read(),
         protocol_factory=factory.protocolFactory)
     if self._add_references:
         # References are not part of the serialized form; add in place.
         add_references_to_communication(comm)
     return (comm, zipinfo.filename)
Exemple #11
0
def test_validate_minimal_communication_with_uuid():
    """A Communication with id, metadata, type and uuid must validate."""
    comm = Communication()
    comm.id = "myID"
    comm.metadata = AnnotationMetadata(tool="TEST", timestamp=int(time.time()))
    comm.type = "Test Communication"
    comm.uuid = generate_UUID()
    assert validate_communication(comm)
def json_to_concrete(doc: Dict) -> Communication:
    """Build a Concrete Communication from a JSON-derived document dict.

    Reads ``doc['doc_key']`` (Communication id), ``doc['language_id']``
    (single language with probability 1.0), and the sentences produced by
    ``get_flatten_sentence(doc)`` (tokenized into one Section/Sentence).
    Empty entity- and situation-mention sets are attached.

    NOTE(review): ``augf`` is not defined here -- presumably a
    module-level AnalyticUUID generator; confirm it is initialized before
    this is called.
    """
    metadata = AnnotationMetadata(
        tool="BlingBLing",
        timestamp=int(datetime.datetime.now().timestamp())
    )
    # One shared metadata object is reused for every annotation below.
    comm: Communication = Communication(
        uuid=augf.next(),
        id=doc['doc_key'],
        type="aida",
        metadata=metadata,
        lidList=[LanguageIdentification(
            uuid=augf.next(),
            metadata=metadata,
            languageToProbabilityMap={doc['language_id']: 1.0}
        )],
        sectionList=[Section(
            uuid=augf.next(),
            kind="passage",
            sentenceList=[
                Sentence(
                    uuid=augf.next(),
                    tokenization=Tokenization(
                        uuid=augf.next(),
                        kind=TokenizationKind.TOKEN_LIST,
                        metadata=metadata,
                        tokenList=TokenList(
                            tokenList=[
                                Token(
                                    tokenIndex=i,
                                    text=t
                                )
                                for i, t in enumerate(get_flatten_sentence(doc))
                            ]
                        )
                    )
                )
            ]
        )],
        entityMentionSetList=[EntityMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )],
        situationMentionSetList=[SituationMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )]
    )

    return comm
Exemple #13
0
def _comm_with_properties(num_properties):
    """Build a test Communication whose SituationMention argument carries
    ``num_properties`` Property objects.

    The Communication contains one Section/Sentence/Tokenization, one
    EntityMentionSet with a single mention, and one SituationMentionSet
    whose mention's argument holds the generated property list.  A fixed
    timestamp (17) keeps the fixture deterministic apart from UUIDs.
    """
    ts = 17
    meta_tokn = AnnotationMetadata(tool='tokn-tool', timestamp=ts)
    toks = TokenList(tokenList=[
        Token(tokenIndex=0, text='text', textSpan=TextSpan(start=0, ending=1))
    ])
    tokn = Tokenization(uuid=generate_UUID(),
                        metadata=meta_tokn,
                        kind=TokenizationKind.TOKEN_LIST,
                        tokenList=toks)
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokn)
    section = Section(uuid=generate_UUID(),
                      kind='kind',
                      label='label',
                      sentenceList=[sentence])
    # Token-ref sequence anchoring both the entity and situation mentions.
    trfs = TokenRefSequence(tokenizationId=tokn.uuid,
                            tokenIndexList=[0],
                            anchorTokenIndex=0)
    em = EntityMention(uuid=generate_UUID(),
                       entityType='entityType',
                       text='text',
                       tokens=trfs)
    meta_ems = AnnotationMetadata(tool='ems-tool', timestamp=ts)
    ems = EntityMentionSet(uuid=generate_UUID(),
                           metadata=meta_ems,
                           mentionList=[em])
    meta_prop = AnnotationMetadata(tool='Annotator1', timestamp=ts)
    props = list(
        Property(value="Property%d" % i, metadata=meta_prop, polarity=4.0)
        for i in range(num_properties))
    am = MentionArgument(role='role',
                         entityMentionId=em.uuid,
                         propertyList=props)
    sm = SituationMention(uuid=generate_UUID(), tokens=trfs, argumentList=[am])
    meta_sms = AnnotationMetadata(tool='sms-tool', timestamp=ts)
    sms = SituationMentionSet(uuid=generate_UUID(),
                              metadata=meta_sms,
                              mentionList=[sm])
    meta_comm = AnnotationMetadata(tool='tool', timestamp=ts)
    comm = Communication(uuid=generate_UUID(),
                         id='id',
                         text='text',
                         type='type',
                         metadata=meta_comm,
                         sectionList=[section],
                         situationMentionSetList=[sms],
                         entityMentionSetList=[ems])
    # Resolve the id-based cross-references into object references.
    add_references_to_communication(comm)
    return comm
Exemple #14
0
def json_tweet_object_to_Communication(tweet):
    """Convert a parsed tweet JSON object into a Concrete Communication.

    The tweet's metadata is preserved as a TweetInfo in
    communicationMetadata; the tweet text becomes both text and
    originalText, and the creation time (parsed with CREATED_AT_FORMAT)
    becomes startTime/endTime.  A language-identification list is
    attached when the TweetInfo provides one.
    """
    tweet_info = json_tweet_object_to_TweetInfo(tweet)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    if 'id_str' in tweet:
        tweet_id = tweet['id_str']
    else:
        logging.warning('Tweet has no id_str, leaving communication id blank')
        tweet_id = None
    tweet_time = datetime_to_timestamp(datetime.strptime(tweet_info.createdAt,
                                                         CREATED_AT_FORMAT))
    comm = Communication(
        communicationMetadata=CommunicationMetadata(
            tweetInfo=tweet_info),
        metadata=AnnotationMetadata(
            tool=TOOL_NAME,
            timestamp=int(time.time())),
        originalText=tweet_info.text,
        text=tweet_info.text,
        type=TWEET_TYPE,
        uuid=aug.next(),
        startTime=tweet_time,
        endTime=tweet_time,
        id=tweet_id
    )

    # either this, or pass in gen as parameter to fx
    # latter is more annoying to test but slightly cleaner
    if tweet_info.lid is not None:
        tweet_info.lid.uuid = aug.next()
        lidList = [tweet_info.lid]
        comm.lidList = lidList
    return comm
def json_tweet_object_to_Communication(tweet):
    """Convert a parsed tweet JSON object into a Concrete Communication.

    Variant of the function above that computes the tweet timestamp with
    ``unix_time`` instead of ``datetime_to_timestamp``; otherwise the
    construction is identical.
    """
    tweet_info = json_tweet_object_to_TweetInfo(tweet)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    if 'id_str' in tweet:
        tweet_id = tweet['id_str']
    else:
        logging.warning('Tweet has no id_str, leaving communication id blank')
        tweet_id = None
    tweet_time = unix_time(datetime.strptime(tweet_info.createdAt,
                                             CREATED_AT_FORMAT))
    comm = Communication(
        communicationMetadata=CommunicationMetadata(
            tweetInfo=tweet_info),
        metadata=AnnotationMetadata(
            tool=TOOL_NAME,
            timestamp=int(time.time())),
        originalText=tweet_info.text,
        text=tweet_info.text,
        type=TWEET_TYPE,
        uuid=aug.next(),
        startTime=tweet_time,
        endTime=tweet_time,
        id=tweet_id
    )

    # either this, or pass in gen as parameter to fx
    # latter is more annoying to test but slightly cleaner
    if tweet_info.lid is not None:
        tweet_info.lid.uuid = aug.next()
        lidList = [tweet_info.lid]
        comm.lidList = lidList
    return comm
def read_communication_from_file(communication_filename, add_references=True):
    """Read a Communication from the file specified by filename.

    Args:

    - `communication_filename`: String with filename
    - `add_references`: If True, add references to the Communication
      in place before returning it

    Returns:

    - A Concrete `Communication` object
    """
    communication = read_thrift_from_file(
        Communication(), communication_filename)
    if add_references:
        add_references_to_communication(communication)
    return communication
def create_comm(comm_id,
                text='',
                comm_type='article',
                section_kind='passage',
                metadata_tool='concrete-python',
                metadata_timestamp=None,
                annotation_level=AL_TOKEN):
    '''
    Create a simple, valid Communication from text.
    By default the text will be split by double-newlines into sections
    and then by single newlines into sentences within those sections.

    annotation_level controls the amount of annotation that is added:
      AL_NONE      add no optional annotations (not even sections)
      AL_SECTION   add sections but not sentences
      AL_SENTENCE  add sentences but not tokens
      AL_TOKEN     add all annotations, up to tokens (the default)

    If metadata_timestamp is None, the current time will be used.
    '''

    if metadata_timestamp is None:
        metadata_timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    # Sections are emitted unless annotation_level is AL_NONE (or None).
    sections = (annotation_level is not None) and (annotation_level != AL_NONE)

    return Communication(
        id=comm_id,
        uuid=aug.next(),
        type=comm_type,
        text=text,
        metadata=AnnotationMetadata(
            tool=metadata_tool,
            timestamp=metadata_timestamp,
        ),
        # Split into sections on double newlines; an all-whitespace text
        # yields an empty section list, and sections=False yields None.
        sectionList=([
            create_section(sec_text, sec_start, sec_end, section_kind, aug,
                           metadata_tool, metadata_timestamp, annotation_level)
            for (sec_text, sec_start, sec_end) in _split(text, '\n\n')
        ] if text.strip() else []) if sections else None,
    )
Exemple #18
0
def main():
    """Extract Communications from an MTurk-style results CSV into a zip.

    Each CSV row's ``--comm-field`` column holds a TJSON-serialized
    Communication; each is deserialized and written to the output zip as
    ``<comm.id>.comm``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('csv_file')
    parser.add_argument('comms_zip_file')
    parser.add_argument('--comm-field', default='Answer.modified_comm')
    args = parser.parse_args()

    # BUG FIX: the CSV handle was opened and never closed; a context
    # manager guarantees it is released even if a row fails to parse.
    with open(args.csv_file, 'rb') as csv_fh:
        reader = unicodecsv.DictReader(csv_fh)

        with CommunicationWriterZip(args.comms_zip_file) as writer:
            for row in reader:
                json_comm = row[args.comm_field]
                comm = Communication()
                TSerialization.deserialize(
                    comm,
                    json_comm.encode('utf-8'),
                    protocol_factory=TJSONProtocol.TJSONProtocolFactory())
                writer.write(comm, comm.id + '.comm')
def convert_communication(input_bytes, input_protocol_factory,
                          output_protocol_factory):
    """
    Re-encode a serialized Communication from one Thrift protocol to
    another.

    * input_bytes: Input file byte stream
    * input_protocol_factory: Callable factory function for input
      encoding, e.g., TBinaryProtocol.TBinaryProtocolFactory.
    * output_protocol_factory: Callable factory function for output
      encoding, e.g., TCompactProtocol.TCompactProtocolFactory.

    Returns the re-encoded byte stream.
    """
    communication = Communication()
    TSerialization.deserialize(
        communication,
        input_bytes,
        protocol_factory=input_protocol_factory())
    return TSerialization.serialize(
        communication, protocol_factory=output_protocol_factory())
Exemple #20
0
def createComm(fn):
    '''Create a Communication from the text file at path fn.

    The Communication type is "QUORA ANSWER" when the basename starts
    with "answer", otherwise "QUORA QUESTION".  Returns None when the
    normalized text is empty or the Communication fails validation.
    '''
    with codecs.open(fn, 'r', 'utf-8') as f:
        txt = f.read()

    comm = Communication()
    comm.id = fn
    comm.uuid = concrete.util.generate_UUID()
    comm.type = "QUORA ANSWER" if fn.split('/')[-1].startswith(
        "answer") else "QUORA QUESTION"
    # Normalize stray non-breaking-space bytes, then collapse whitespace
    # around newlines so section boundaries are clean.
    txt = re.sub('[\xa0\xc2]', ' ', txt)
    txt = re.sub(r'\s*\n\s*', '\n', txt)
    if not txt.strip():
        return None
    comm.text = txt
    comm.metadata = create_dummy_annotation()

    # Section boundaries: the first newline of every run of newlines,
    # plus the end of the text when it is not already covered.
    breaks = [
        i for i, ch in enumerate(txt)
        if ch == '\n' and i > 0 and txt[i - 1] != '\n'
    ]
    if not breaks or breaks[-1] != len(txt) - 1:
        breaks += [len(txt)]

    sections = []
    start = 0
    for i in breaks:
        sec = concrete.Section()
        sec.uuid = concrete.util.generate_UUID()
        sec.kind = "Passage"
        sec.textSpan = concrete.TextSpan(start, i)
        sections.append(sec)
        start = i

    comm.sectionList = sections

    if not concrete.validate.validate_communication(comm):
        return None
    return comm
 def _next_from_tar(self):
     """Read the next Communication from the open tar archive.

     Skips directories and macOS "._" attribute entries.  Returns a
     ``(Communication, member_name)`` tuple; raises ``StopIteration``
     when the archive is exhausted.
     """
     while True:
         tarinfo = self.tar.next()
         if tarinfo is None:
             raise StopIteration
         if not tarinfo.isfile():
             # Ignore directories
             continue
         filename = os.path.split(tarinfo.name)[-1]
         # BUG FIX: the original compared characters with `is`
         # (identity, an interning accident and a SyntaxWarning on
         # Python >= 3.8) and indexed filename[1] without a length
         # check; startswith is correct and safe for short names.
         if filename.startswith('._'):
             # Ignore attribute files created by OS X tar
             continue
         comm = TSerialization.deserialize(
             Communication(),
             self.tar.extractfile(tarinfo).read(),
             protocol_factory=factory.protocolFactory)
         if self._add_references:
             add_references_to_communication(comm)
         # hack to keep memory usage O(1)
         # (...but the real hack is tarfile :)
         self.tar.members = []
         return (comm, tarinfo.name)
Exemple #22
0
    def index():
        """Handle a form post: annotate the submitted text remotely.

        NOTE(review): reads closure variables (``options``, ``request``)
        -- presumably a bottle/flask route handler defined inside a
        setup function; confirm against the enclosing scope.
        """
        text = request.forms.get('text')
        # Open a framed/compact Thrift connection to the annotator service.
        transport = TTransport.TFramedTransport(
            TSocket.TSocket(options.annotator_host, options.annotator_port))
        protocol = TCompactProtocol.TCompactProtocol(transport)
        client = Annotator.Client(protocol)
        transport.open()
        augf = AnalyticUUIDGeneratorFactory()
        aug = augf.create()
        # Wrap the raw text in a minimal single-section Communication.
        c = Communication(
            id="",
            text=text,
            uuid=aug.next(),
            type="user-supplied input",
            metadata=AnnotationMetadata(timestamp=int(time.time()),
                                        tool="stdin"),
            sectionList=[
                Section(uuid=aug.next(),
                        sentenceList=[],
                        kind="paragraph",
                        textSpan=TextSpan(start=0, ending=len(text)))
            ],
            entitySetList=[],
            entityMentionSetList=[],
        )

        new_c = client.annotate(c)
        form = '''<form action="/" method="post">
        Enter or paste some text: <input name="text" type="text" />
        <input value="Submit" type="submit" />
        </form>
        '''
        # Render the input form plus one line per annotated entity.
        return form + "\n".join(["<h3>%s</h3>" % text] + [
            "\n".join([
                "<br>%s %s" % (e.type, e.canonicalName) for e in es.entityList
            ]) for es in new_c.entitySetList
        ])
Exemple #23
0
                   " --oprotocol, can be provided")
            exit(1)
    encoding_input = KNOWN_CONVERSIONS[args.direction][
        0] if args.iprotocol is None else PROTOCOLS[args.iprotocol]
    encoding_output = KNOWN_CONVERSIONS[args.direction][
        1] if args.oprotocol is None else PROTOCOLS[args.oprotocol]
    if ofile_encoding == "gzip":
        out_writer = gzip.GzipFile(args.output_file, 'wb')
    else:
        out_writer = open(args.output_file, 'w')
    if ifile_encoding == 'gzip':
        f = gzip.GzipFile(args.input_file)
        transportIn = TTransport.TFileObjectTransport(f)
        protocolIn = encoding_input().getProtocol(transportIn)
        while True:
            try:
                comm = Communication()
                comm.read(protocolIn)
                output_bytes = TSerialization.serialize(
                    comm, protocol_factory=encoding_output())
                out_writer.write(output_bytes)
            except EOFError:
                break
        f.close()
    else:
        convert(input_file_path=args.input_file,
                output_file_path=args.output_file,
                input_protocol_factory=encoding_input,
                output_protocol_factory=encoding_output)
    out_writer.close()
    encoding_input = KNOWN_CONVERSIONS[
        args.direction][0] if args.iprotocol is None else PROTOCOLS[
            args.iprotocol]
    encoding_output = KNOWN_CONVERSIONS[
        args.direction][1] if args.oprotocol is None else PROTOCOLS[
            args.oprotocol]
    if ofile_encoding == "gzip":
        out_writer = gzip.GzipFile(args.output_file, 'wb')
    else:
        out_writer = open(args.output_file, 'w')
    if ifile_encoding == 'gzip':
        f = gzip.GzipFile(args.input_file)
        transportIn = TTransport.TFileObjectTransport(f)
        protocolIn = encoding_input().getProtocol(transportIn)
        while True:
            try:
                comm = Communication()
                comm.read(protocolIn)
                output_bytes = TSerialization.serialize(
                    comm, protocol_factory=encoding_output())
                out_writer.write(output_bytes)
            except EOFError:
                break
        f.close()
    else:
        convert(input_file_path=args.input_file,
                output_file_path=args.output_file,
                input_protocol_factory=encoding_input,
                output_protocol_factory=encoding_output)
    out_writer.close()
Exemple #25
0
 def test_next(self):
     # Smoke test: a Communication accepts a freshly generated UUID.
     comm = Communication()
     comm.uuid = generate_UUID()
Exemple #26
0
def test_generate_UUID():
    # Smoke test: generate_UUID() can be assigned to a Communication.
    comm = Communication()
    comm.uuid = generate_UUID()
 def test_next(self):
     # Smoke test: a Communication accepts a freshly generated UUID.
     comm = Communication()
     comm.uuid = generate_UUID()
Exemple #28
0
def test_generate_UUID():
    # Smoke test: generate_UUID() can be assigned to a Communication.
    comm = Communication()
    comm.uuid = generate_UUID()
    parser.add_argument("-p", "--port", dest="port", type=int, default=9090)
    parser.add_argument("-H", "--host", dest="host", default="localhost")
    options = parser.parse_args()

    # Make socket
    transport = TSocket.TSocket(options.host, options.port)

    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)

    # Wrap in a protocol
    protocol = TCompactProtocol.TCompactProtocol(transport)

    # Create a client to use the protocol encoder
    client = Annotator.Client(protocol)
    
    # Connect!
    transport.open()

    while True:
        s = raw_input("Write some text > ")
        if re.match(r"^\s*$", s):
            break
        else:
            augf = AnalyticUUIDGeneratorFactory()
            aug = augf.create()
            c = Communication(id="", text=s, uuid=aug.next(), type="tweet", metadata=AnnotationMetadata(timestamp=0, tool="stdin"), lidList=[])

            new_c = client.annotate(c)
            print new_c
Exemple #30
0
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:

    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
       The string will be whitespace-tokenized.

    Returns:

    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    tool_name = "TEST"
    created_at = int(time.time())

    def _meta():
        # One fresh metadata object per annotation, as the API expects.
        return AnnotationMetadata(tool=tool_name, timestamp=created_at)

    id_gen = AnalyticUUIDGeneratorFactory().create()

    comm = Communication(id=comm_id,
                         metadata=_meta(),
                         type=tool_name,
                         uuid=id_gen.next())

    tokenization = Tokenization(
        kind=TokenizationKind.TOKEN_LIST,
        metadata=_meta(),
        tokenList=TokenList(tokenList=[
            Token(text=word, tokenIndex=n)
            for n, word in enumerate(sentence_string.split())
        ]),
        uuid=id_gen.next())

    sentence = Sentence(textSpan=TextSpan(0, len(sentence_string)),
                        tokenization=tokenization,
                        uuid=id_gen.next())

    section = Section(kind="SectionKind",
                      sentenceList=[sentence],
                      textSpan=TextSpan(0, len(sentence_string)),
                      uuid=id_gen.next())

    comm.sectionList = [section]
    comm.text = sentence_string

    return comm
Exemple #31
0
    transport.open()

    while True:
        s = raw_input("Write some text > ")
        if re.match(r"^\s*$", s):
            break
        else:
            augf = AnalyticUUIDGeneratorFactory()
            aug = augf.create()
            c = Communication(
                id="",
                text=s,
                uuid=aug.next(),
                type="user-supplied input",
                metadata=AnnotationMetadata(timestamp=int(time.time()),
                                            tool="stdin"),
                sectionList=[
                    Section(uuid=aug.next(),
                            sentenceList=[],
                            kind="paragraph",
                            textSpan=TextSpan(start=0, ending=len(s)))
                ],
                entitySetList=[],
                entityMentionSetList=[],
            )

            new_c = client.annotate(c)
            for es in new_c.entitySetList:
                for e in es.entityList:
                    print "%s %s" % (e.type, e.canonicalName)
Exemple #32
0
    ofd = CommunicationWriterTGZ(options.output)
    with reader(gzip.open(options.input)) as ifd:
        for i, line in enumerate(ifd):
            toks = line.strip().split("\t")
            if len(toks) != 3:
                continue            
            cid, label, text = toks
            g = ugf.create()
            t = int(time())
            comm = Communication(id=cid,
                                 uuid=g.next(),
                                 type="Text document",
                                 text=text,
                                 communicationTaggingList=[CommunicationTagging(uuid=g.next(),
                                                                                metadata=AnnotationMetadata(tool="Gold labeling",
                                                                                                            timestamp=t,
                                                                                                            kBest=1,
                                                                                ),
                                                                                taggingType=options.tag_type,
                                                                                tagList=[label],
                                                                                confidenceList=[1.0],
                                 )],
                                 metadata=AnnotationMetadata(tool="text_to_concrete.py ingester", timestamp=t, kBest=1),
                                 sectionList=[Section(uuid=g.next(),
                                                      textSpan=TextSpan(start=0, ending=len(text)),
                                                      kind="content",
                                                      )
                                 ])
            ofd.write(comm)
    ofd.close()