def setUp(self):
    """Build three fixture communications plus their serialized buffers."""
    for index in (1, 2, 3):
        comm = create_comm('comm-%d' % index)
        setattr(self, 'comm%d' % index, comm)
        setattr(self, 'buf%d' % index, write_communication_to_buffer(comm))
def load(path, per_line, annotation_level):
    '''
    Generate communications constructed from text files in specified
    tarball, assigning ids that are meaningful tar-friendly filenames.

    If per_line is True:  One communication is created for each newline
    in a file.  Note blank lines will produce communications.  The
    trailing newline is included in the communication text.  If a file
    does not have a terminating newline, a communication is nonetheless
    produced for the last line, and a newline is appended to the end of
    the text.
    '''
    # 'r|*' streams the (possibly compressed) tar; entries are visited once.
    with tarfile.open(path, 'r|*') as archive:
        entry = archive.next()
        while entry is not None:
            if entry.isfile():
                raw = archive.extractfile(entry).read().decode('utf-8')
                if per_line:
                    # Drop a single trailing newline so split() does not
                    # yield a spurious empty final line.
                    body = raw[:-1] if raw.endswith('\n') else raw
                    for line_no, line in enumerate(body.split('\n')):
                        yield create_comm(
                            u'%s/%d' % (entry.name, line_no),
                            line + u'\n',
                            annotation_level=annotation_level)
                else:
                    yield create_comm(entry.name, raw,
                                      annotation_level=annotation_level)
            # Clear the cached member list so memory stays bounded while
            # streaming large archives.
            archive.members = []
            entry = archive.next()
def main():
    """CLI entry point: read a UTF-8 text file, write a communication."""
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description='Convert text file to communication',
    )
    parser.set_defaults(annotation_level=AL_NONE)
    parser.add_argument('text_path', type=str,
                        help='Input text file path (- for stdin)')
    parser.add_argument('concrete_path', type=str,
                        help='Output concrete file path (- for stdout)')
    add_annotation_level_argparse_argument(parser)
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    # '-' means stdin/stdout; map to /dev/fd aliases.
    # Won't work on Windows... but that use case is very unlikely
    if args.text_path == '-':
        text_path = '/dev/fd/0'
    else:
        text_path = args.text_path
    if args.concrete_path == '-':
        concrete_path = '/dev/fd/1'
    else:
        concrete_path = args.concrete_path

    with codecs.open(text_path, encoding='utf-8') as f:
        comm = create_comm(text_path, f.read(),
                           annotation_level=args.annotation_level)
    write_communication_to_file(comm, concrete_path)
def main():
    """CLI entry point: convert a UTF-8 text file to a communication file."""
    set_stdout_encoding()

    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description='Convert text file to communication',
    )
    parser.set_defaults(annotation_level=AL_NONE)
    parser.add_argument('text_path', type=str,
                        help='Input text file path (- for stdin)')
    parser.add_argument('concrete_path', type=str,
                        help='Output concrete file path (- for stdout)')
    add_annotation_level_argparse_argument(parser)
    parser.add_argument('-l', '--loglevel', '--log-level',
                        help='Logging verbosity level threshold (to stderr)',
                        default='info')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    # '-' means stdin/stdout via /dev/fd aliases.
    # Won't work on Windows
    if args.text_path == '-':
        text_path = '/dev/fd/0'
    else:
        text_path = args.text_path
    if args.concrete_path == '-':
        concrete_path = '/dev/fd/1'
    else:
        concrete_path = args.concrete_path

    with codecs.open(text_path, encoding='utf-8') as f:
        comm = create_comm(text_path, f.read(),
                           annotation_level=args.annotation_level)
    write_communication_to_file(comm, concrete_path)
def main():
    """Read one UTF-8 text file and serialize it as a concrete communication."""
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description='Convert text file to communication',
    )
    parser.set_defaults(annotation_level=AL_NONE)
    parser.add_argument('text_path', type=str,
                        help='Input text file path (- for stdin)')
    parser.add_argument('concrete_path', type=str,
                        help='Output concrete file path (- for stdout)')
    add_annotation_level_argparse_argument(parser)
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    # Map '-' to the /dev/fd stdin/stdout aliases.
    # Won't work on Windows... but that use case is very unlikely
    text_path = '/dev/fd/0' if args.text_path == '-' else args.text_path
    concrete_path = (
        '/dev/fd/1' if args.concrete_path == '-' else args.concrete_path
    )

    with codecs.open(text_path, encoding='utf-8') as f:
        text = f.read()
    comm = create_comm(text_path, text,
                       annotation_level=args.annotation_level)
    write_communication_to_file(comm, concrete_path)
def test_create_comm_complex_al_none():
    """AL_NONE leaves the section list unset on multi-section text."""
    text = '\n\nsimple comm\t\t.\nor ...\n\nisit?\n'
    comm = create_comm('one', text, annotation_level=AL_NONE)
    assert comm.id == 'one'
    assert comm.text == text
    assert comm.sectionList is None
    assert validate_communication(comm)
def test_create_comm_complex_al_section():
    """AL_SECTION creates section spans but no sentences."""
    text = '\n\nsimple comm\t\t.\nor ...\n\nisit?\n'
    comm = create_comm('one', text, annotation_level=AL_SECTION)
    assert comm.id == 'one'
    assert comm.text == text
    expected_spans = [(0, 0), (2, 23), (25, 31)]
    assert len(comm.sectionList) == len(expected_spans)
    for section, (start, ending) in zip(comm.sectionList, expected_spans):
        assert section.textSpan.start == start
        assert section.textSpan.ending == ending
        assert section.sentenceList is None
    assert validate_communication(comm)
def test_read_write_fixed_point(self):
    """Serialize -> deserialize -> serialize must reproduce the same bytes."""
    comm = create_comm('comm-1')
    buf_1 = write_communication_to_buffer(comm)
    buf_2 = write_communication_to_buffer(
        read_communication_from_buffer(buf_1)
    )
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual.
    self.assertEqual(buf_1, buf_2)
def test_annotate(self):
    """A no-op annotator must echo the communication back unchanged."""
    impl = NoopAnnotator()
    host = 'localhost'
    port = find_port()
    comm_id = '1-2-3-4'
    comm = create_comm(comm_id)
    # Capture expected fields before the round trip.
    expected_uuid = comm.uuid.uuidString
    expected_tool = comm.metadata.tool
    expected_timestamp = comm.metadata.timestamp
    with SubprocessAnnotatorServiceWrapper(impl, host, port, timeout=5):
        socket = TSocket.TSocket(host, port)
        transport = TTransport.TFramedTransport(socket)
        protocol = TCompactProtocol.TCompactProtocol(transport)
        client = Annotator.Client(protocol)
        transport.open()
        res = client.annotate(comm)
        transport.close()
        self.assertEqual(res.id, comm_id)
        self.assertEqual(res.uuid.uuidString, expected_uuid)
        self.assertEqual(res.metadata.tool, expected_tool)
        self.assertEqual(res.metadata.timestamp, expected_timestamp)
def test_annotate(self):
    """A no-op AnnotateCommunicationService must echo the input unchanged."""
    impl = NoopAnnotateCommunicationService()
    host = 'localhost'
    port = find_port()
    comm_id = '1-2-3-4'
    comm = create_comm(comm_id)
    # Capture expected fields before the round trip.
    expected_uuid = comm.uuid.uuidString
    expected_tool = comm.metadata.tool
    expected_timestamp = comm.metadata.timestamp
    with SubprocessAnnotateCommunicationServiceWrapper(impl, host, port,
                                                       timeout=5):
        socket = TSocket.TSocket(host, port)
        transport = TTransport.TFramedTransport(socket)
        protocol = TCompactProtocol.TCompactProtocolAccelerated(transport)
        client = AnnotateCommunicationService.Client(protocol)
        transport.open()
        res = client.annotate(comm)
        transport.close()
        self.assertEqual(res.id, comm_id)
        self.assertEqual(res.uuid.uuidString, expected_uuid)
        self.assertEqual(res.metadata.tool, expected_tool)
        self.assertEqual(res.metadata.timestamp, expected_timestamp)
def test_create_comm_one_sentence_al_section():
    """AL_SECTION on one sentence: one section span, no sentences."""
    text = 'simple comm\t\t.'
    comm = create_comm('one', text, annotation_level=AL_SECTION)
    assert comm.id == 'one'
    assert comm.text == text
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 14)
    assert section.sentenceList is None
    assert validate_communication(comm)
def test_read_write_fixed_point(self):
    """Writing, reading back, and re-writing a key yields identical buffers."""
    key = 'comm'
    comm = create_comm('comm-1')
    with RedisServer(loglevel='warning') as server:
        redis_db = Redis(port=server.port)
        buf_1 = write_communication_to_redis_key(redis_db, key, comm)
        buf_2 = write_communication_to_redis_key(
            redis_db, key,
            read_communication_from_redis_key(redis_db, key)
        )
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual.
    self.assertEqual(buf_1, buf_2)
def test_create_comm_complex():
    """Full annotation: sections, sentences, and tokenized text all present."""
    text = '\n\nsimple comm\t\t.\nor ...\n\nisit?\n'
    comm = create_comm('one', text)
    assert comm.id == 'one'
    assert comm.text == text
    # (section span, [(sentence span, [token texts]), ...])
    expected = [
        ((0, 0), []),
        ((2, 23), [((2, 16), ['simple', 'comm', '.']),
                   ((17, 23), ['or', '...'])]),
        ((25, 31), [((25, 30), ['isit?']),
                    ((31, 31), [])]),
    ]
    assert len(comm.sectionList) == len(expected)
    for section, (sect_span, sent_expectations) in zip(comm.sectionList,
                                                       expected):
        assert (section.textSpan.start, section.textSpan.ending) == sect_span
        assert len(section.sentenceList) == len(sent_expectations)
        for sentence, (sent_span, token_texts) in zip(section.sentenceList,
                                                      sent_expectations):
            assert (sentence.textSpan.start,
                    sentence.textSpan.ending) == sent_span
            tokens = sentence.tokenization.tokenList.tokenList
            assert len(tokens) == len(token_texts)
            for index, token in enumerate(tokens):
                assert token.tokenIndex == index
                assert token.text == token_texts[index]
    assert validate_communication(comm)
def test_lattice_with_token_list_kind():
    """With a token-list kind set, a cached lattice path is ignored."""
    comm = create_comm('comm-1', 'mambo no. 4')
    tokenization = comm.sectionList[0].sentenceList[0].tokenization
    # Attach a cached best path whose tokens differ from the token list.
    best_path = LatticePath()
    best_path.tokenList = [
        Token(tokenIndex=0, text='mambo'),
        Token(tokenIndex=0, text='no.'),
        Token(tokenIndex=0, text='3'),
    ]
    lattice = TokenLattice()
    lattice.cachedBestPath = best_path
    tokenization.lattice = lattice
    assert [t.text for t in get_tokens(tokenization)] == ['mambo', 'no.', '4']
def test_communication_deep_copy(self):
    """Mutating one deep copy must not affect the original or its siblings."""
    comm1 = create_comm('a-b-c', text='foo bar baz .')
    comm2 = communication_deep_copy(comm1)
    comm3 = communication_deep_copy(comm1)
    self.assert_simple_comms_equal(comm1, comm2)
    self.assert_simple_comms_equal(comm2, comm3)
    tkzn1 = comm1.sectionList[0].sentenceList[0].tokenization
    tkzn1.tokenList.tokenList[0] = Token(text='bbq', tokenIndex=0)
    tkzn2 = comm2.sectionList[0].sentenceList[0].tokenization
    # Compare concrete lists: under Python 3 map() returns lazy iterators,
    # and two distinct map objects are never equal, so the original
    # assertNotEqual(map(...), map(...)) would pass vacuously.
    self.assertNotEqual(
        [t.text for t in tkzn1.tokenList.tokenList],
        [t.text for t in tkzn2.tokenList.tokenList],
    )
    self.assert_simple_comms_equal(comm2, comm3)
def test_create_comm_unicode_al_sentence():
    """AL_SENTENCE on unicode text: sentence spans exist, no tokenization."""
    text = u'狐狸\t\t.'
    comm = create_comm('one', text, annotation_level=AL_SENTENCE)
    assert comm.id == 'one'
    assert comm.text == text
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 5)
    assert len(section.sentenceList) == 1
    sentence = section.sentenceList[0]
    assert (sentence.textSpan.start, sentence.textSpan.ending) == (0, 5)
    assert sentence.tokenization is None
    assert validate_communication(comm)
def test_lattice_with_token_list_kind(self):
    """With a token-list kind set, a cached lattice path is ignored."""
    comm = create_comm('comm-1', 'mambo no. 4')
    tokenization = comm.sectionList[0].sentenceList[0].tokenization
    # Attach a cached best path whose tokens differ from the token list.
    best_path = LatticePath()
    best_path.tokenList = [
        Token(tokenIndex=0, text='mambo'),
        Token(tokenIndex=0, text='no.'),
        Token(tokenIndex=0, text='3'),
    ]
    lattice = TokenLattice()
    lattice.cachedBestPath = best_path
    tokenization.lattice = lattice
    self.assertEqual(['mambo', 'no.', '4'],
                     [t.text for t in get_tokens(tokenization)])
def test_create_comm_unicode():
    """Default annotation on unicode text produces a tokenized sentence."""
    text = u'狐狸\t\t.'
    comm = create_comm('one', text)
    assert comm.id == 'one'
    assert comm.text == text
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 5)
    assert len(section.sentenceList) == 1
    sentence = section.sentenceList[0]
    assert (sentence.textSpan.start, sentence.textSpan.ending) == (0, 5)
    tokens = sentence.tokenization.tokenList.tokenList
    expected_texts = [u'狐狸', '.']
    assert len(tokens) == len(expected_texts)
    for index, token in enumerate(tokens):
        assert token.tokenIndex == index
        assert token.text == expected_texts[index]
    assert validate_communication(comm)
def test_create_comm_one_sentence():
    """Default annotation on one sentence yields a single tokenized span."""
    text = 'simple comm\t\t.'
    comm = create_comm('one', text)
    assert comm.id == 'one'
    assert comm.text == text
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 14)
    assert len(section.sentenceList) == 1
    sentence = section.sentenceList[0]
    assert (sentence.textSpan.start, sentence.textSpan.ending) == (0, 14)
    tokens = sentence.tokenization.tokenList.tokenList
    expected_texts = ['simple', 'comm', '.']
    assert len(tokens) == len(expected_texts)
    for index, token in enumerate(tokens):
        assert token.tokenIndex == index
        assert token.text == expected_texts[index]
    assert validate_communication(comm)
def test_create_comm_complex_al_sentence():
    """AL_SENTENCE creates sections and sentence spans but no tokenization."""
    text = '\n\nsimple comm\t\t.\nor ...\n\nisit?\n'
    comm = create_comm('one', text, annotation_level=AL_SENTENCE)
    assert comm.id == 'one'
    assert comm.text == text
    # (section span, [sentence spans ...])
    expected = [
        ((0, 0), []),
        ((2, 23), [(2, 16), (17, 23)]),
        ((25, 31), [(25, 30), (31, 31)]),
    ]
    assert len(comm.sectionList) == len(expected)
    for section, (sect_span, sent_spans) in zip(comm.sectionList, expected):
        assert (section.textSpan.start, section.textSpan.ending) == sect_span
        assert len(section.sentenceList) == len(sent_spans)
        for sentence, span in zip(section.sentenceList, sent_spans):
            assert (sentence.textSpan.start,
                    sentence.textSpan.ending) == span
            assert sentence.tokenization is None
    assert validate_communication(comm)
def main():
    """CLI: convert a UTF-8 text file to a serialized communication."""
    set_stdout_encoding()

    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description='Convert text file to communication',
    )
    parser.set_defaults(annotation_level=AL_NONE)
    parser.add_argument('text_path', type=str,
                        help='Input text file path (- for stdin)')
    parser.add_argument('concrete_path', type=str,
                        help='Output concrete file path (- for stdout)')
    add_annotation_level_argparse_argument(parser)
    parser.add_argument('-l', '--loglevel', '--log-level',
                        help='Logging verbosity level threshold (to stderr)',
                        default='info')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    # Translate '-' into the /dev/fd stdin/stdout aliases.
    # Won't work on Windows
    text_path = '/dev/fd/0' if args.text_path == '-' else args.text_path
    concrete_path = (
        '/dev/fd/1' if args.concrete_path == '-' else args.concrete_path
    )

    with codecs.open(text_path, encoding='utf-8') as f:
        contents = f.read()
    comm = create_comm(text_path, contents,
                       annotation_level=args.annotation_level)
    write_communication_to_file(comm, concrete_path)
def test_no_lattice_with_no_kind():
    """Without a lattice, get_tokens yields the plain token list."""
    comm = create_comm('comm-1', 'mambo no. 4')
    tokenization = comm.sectionList[0].sentenceList[0].tokenization
    assert [t.text for t in get_tokens(tokenization)] == ['mambo', 'no.', '4']
This Python script is secretly a shell script. """ import os from concrete.util import write_communication_to_file from concrete.util.simple_comm import create_comm text = 'Super simple sentence .' n1 = 'simple_1.concrete' n2 = 'simple_2.concrete' n3 = 'simple_3.concrete' write_communication_to_file(create_comm('one', text), n1) write_communication_to_file(create_comm('two', text), n2) write_communication_to_file(create_comm('three', text), n3) os.system('gzip < %s > %s.gz' % (n1, n1)) os.system('bzip2 < %s > %s.bz2' % (n1, n1)) os.system('cat %s %s %s > simple_concatenated' % (n1, n2, n3)) os.system('gzip < simple_concatenated > simple_concatenated.gz') os.system('bzip2 < simple_concatenated > simple_concatenated.bz2') os.system('tar -cf simple.tar %s %s %s' % (n1, n2, n3)) os.system('tar -czf simple.tar.gz %s %s %s' % (n1, n2, n3)) os.system('tar -cjf simple.tar.bz2 %s %s %s' % (n1, n2, n3)) os.system('zip simple.zip %s %s %s' % (n1, n2, n3)) os.system('mkdir -p a/b a/c') os.system('cp %s a/b/' % n1) os.system('cp %s %s a/c/' % (n2, n3))
def test_create_comm_ws_al_none():
    """AL_NONE on whitespace-only text leaves the section list unset."""
    text = '\t \t\r\n\n'
    comm = create_comm('one', text, annotation_level=AL_NONE)
    assert comm.id == 'one'
    assert comm.text == text
    assert comm.sectionList is None
    assert validate_communication(comm)
def test_no_lattice_with_no_kind(self):
    """Without a lattice, get_tokens yields the plain token list."""
    comm = create_comm('comm-1', 'mambo no. 4')
    tokenization = comm.sectionList[0].sentenceList[0].tokenization
    self.assertEqual(['mambo', 'no.', '4'],
                     [t.text for t in get_tokens(tokenization)])
def _add_comm_to_list(sleep, port, comm_id, key):
    """After sleeping, push a serialized communication onto a Redis list.

    Used as a background worker so a blocking reader can be exercised.
    """
    time.sleep(sleep)
    client = Redis(port=port)
    serialized = write_communication_to_buffer(create_comm(comm_id))
    client.lpush(key, serialized)
def test_create_comm_ws_al_sentence():
    """AL_SENTENCE on whitespace-only text yields an empty section list."""
    text = '\t \t\r\n\n'
    comm = create_comm('one', text, annotation_level=AL_SENTENCE)
    assert comm.id == 'one'
    assert comm.text == text
    assert comm.sectionList == []
    assert validate_communication(comm)
def test_create_comm_one_sentence_al_none():
    """AL_NONE on a simple sentence leaves the section list unset."""
    text = 'simple comm\t\t.'
    comm = create_comm('one', text, annotation_level=AL_NONE)
    assert comm.id == 'one'
    assert comm.text == text
    assert comm.sectionList is None
    assert validate_communication(comm)
def test_create_comm_unicode_al_none():
    """AL_NONE on unicode text leaves the section list unset."""
    text = u'狐狸\t\t.'
    comm = create_comm('one', text, annotation_level=AL_NONE)
    assert comm.id == 'one'
    assert comm.text == text
    assert comm.sectionList is None
    assert validate_communication(comm)
def test_create_comm_empty_al_section():
    """AL_SECTION with no text: empty text and an empty section list."""
    comm = create_comm('one', annotation_level=AL_SECTION)
    assert comm.id == 'one'
    assert comm.text == ''
    assert comm.sectionList == []
    assert validate_communication(comm)
def test_create_comm_empty():
    """Default annotation with no text: empty text, empty section list."""
    comm = create_comm('one')
    assert comm.id == 'one'
    assert comm.text == ''
    assert comm.sectionList == []
    assert validate_communication(comm)
def test_create_comm_ws():
    """Whitespace-only text yields an empty section list by default."""
    text = '\t \t\r\n\n'
    comm = create_comm('one', text)
    assert comm.id == 'one'
    assert comm.text == text
    assert comm.sectionList == []
    assert validate_communication(comm)
def setUp(self):
    """Create three fixture communications."""
    for index in (1, 2, 3):
        setattr(self, 'comm%d' % index, create_comm('comm-%d' % index))