def test_create_comm_tarball_log_every(output_file, text_l0, text_l1):
    """With --log-interval 1, the script logs an INFO line per comm."""
    p = Popen([
        'scripts/create-comm-tarball.py',
        '--log-level', 'INFO',
        '--log-interval', '1',
        'tests/testdata/les-deux-chandeliers.tar.gz',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0
    # communicate() returns bytes on Python 3; decode before splitting.
    # Two input files with interval 1 => at least two INFO lines.
    assert len([line for line in stderr.decode('utf-8').strip().split('\n')
                if 'INFO' in line]) >= 2
    reader = CommunicationReader(output_file)
    it = iter(reader)
    # Python 3 compatibility: next(it) instead of the py2-only it.next().
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l0.txt'
    assert validate_communication(comm)
    assert comm.text == text_l0
    assert comm.sectionList is None
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l1.txt'
    assert validate_communication(comm)
    assert comm.text == text_l1
    assert comm.sectionList is None
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_compress_uuids(output_file, args):
    """compress-uuids.py keeps comms valid and shrinks the archive."""
    input_file = "tests/testdata/simple.tar.gz"
    p = Popen(["scripts/compress-uuids.py", input_file, output_file] +
              list(args), stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0
    reader = CommunicationReader(output_file)
    it = iter(reader)
    # Python 3 compatibility: next(it) instead of the py2-only it.next().
    (comm, comm_filename) = next(it)
    assert comm_filename == "simple_1.concrete"
    assert comm.id == "one"
    assert validate_communication(comm)
    (comm, comm_filename) = next(it)
    assert comm_filename == "simple_2.concrete"
    assert comm.id == "two"
    assert validate_communication(comm)
    (comm, comm_filename) = next(it)
    assert comm_filename == "simple_3.concrete"
    assert comm.id == "three"
    assert validate_communication(comm)
    # Compression must actually reduce the on-disk size.
    assert os.stat(output_file).st_size < os.stat(input_file).st_size
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_compress_uuids_api(reader_kwargs, compress_kwargs):
    """compress_uuids preserves comm ids and produces valid comms."""
    reader = CommunicationReader('tests/testdata/simple.tar.gz',
                                 **reader_kwargs)
    it = iter(reader)
    for expected_id in ('one', 'two', 'three'):
        (comm, _) = next(it)
        (new_comm, uc) = compress_uuids(comm, **compress_kwargs)
        assert new_comm.id == expected_id
        assert comm.id == new_comm.id
        assert validate_communication(new_comm)
    # The archive holds exactly three communications.
    assert next(it, None) is None
def test_compress_uuids_api(reader_kwargs, compress_kwargs):
    """compress_uuids preserves comm ids and produces valid comms."""
    input_file = "tests/testdata/simple.tar.gz"
    reader = CommunicationReader(input_file, **reader_kwargs)
    it = iter(reader)
    # Python 3 compatibility: next(it) instead of the py2-only it.next().
    (comm, _) = next(it)
    (new_comm, uc) = compress_uuids(comm, **compress_kwargs)
    assert new_comm.id == "one"
    assert comm.id == new_comm.id
    assert validate_communication(new_comm)
    (comm, _) = next(it)
    (new_comm, uc) = compress_uuids(comm, **compress_kwargs)
    assert new_comm.id == "two"
    assert comm.id == new_comm.id
    assert validate_communication(new_comm)
    (comm, _) = next(it)
    (new_comm, uc) = compress_uuids(comm, **compress_kwargs)
    assert new_comm.id == "three"
    assert comm.id == new_comm.id
    assert validate_communication(new_comm)
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_create_comm_tarball(output_file, text_l0, text_l1):
    """create-comm-tarball.py converts a tar.gz of text files to comms."""
    p = Popen([
        'scripts/create-comm-tarball.py',
        'tests/testdata/les-deux-chandeliers.tar.gz',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0
    reader = CommunicationReader(output_file)
    it = iter(reader)
    # Python 3 compatibility: next(it) instead of the py2-only it.next().
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l0.txt'
    assert validate_communication(comm)
    assert comm.text == text_l0
    assert comm.sectionList is None
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l1.txt'
    assert validate_communication(comm)
    assert comm.text == text_l1
    assert comm.sectionList is None
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_create_comm_tarball(output_file, text_l0, text_l1):
    """create-comm-tarball.py converts a tar.gz of text files to comms."""
    proc = Popen(
        [sys.executable, 'scripts/create-comm-tarball.py',
         'tests/testdata/les-deux-chandeliers.tar.gz', output_file],
        stdout=PIPE, stderr=PIPE)
    proc.communicate()
    assert proc.returncode == 0
    it = iter(CommunicationReader(output_file))
    expected = (('les-deux-chandeliers/l0.txt', text_l0),
                ('les-deux-chandeliers/l1.txt', text_l1))
    for (comm_id, text) in expected:
        (comm, _) = next(it)
        assert comm.id == comm_id
        assert validate_communication(comm)
        assert comm.text == text
        assert comm.sectionList is None
    # Exactly two communications come out of the archive.
    assert next(it, None) is None
def test_create_comm_tarball_stdin(output_file, text_l0, text_l1):
    """The script also accepts the tarball on stdin via '-'."""
    p = Popen(['scripts/create-comm-tarball.py', '-', output_file],
              stdin=PIPE, stdout=PIPE, stderr=PIPE)
    with open('tests/testdata/les-deux-chandeliers.tar.gz', 'rb') as f:
        (stdout, stderr) = p.communicate(f.read())
    assert p.returncode == 0
    reader = CommunicationReader(output_file)
    it = iter(reader)
    # Python 3 compatibility: next(it) instead of the py2-only it.next().
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l0.txt'
    assert validate_communication(comm)
    assert comm.text == text_l0
    assert comm.sectionList is None
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l1.txt'
    assert validate_communication(comm)
    assert comm.text == text_l1
    assert comm.sectionList is None
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_create_comm_tarball_annotated(output_file, text_l0, text_l1):
    """--annotation-level section populates each comm's sectionList."""
    p = Popen([
        'scripts/create-comm-tarball.py',
        '--annotation-level', 'section',
        'tests/testdata/les-deux-chandeliers.tar.gz',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0
    reader = CommunicationReader(output_file)
    it = iter(reader)
    # Python 3 compatibility: next(it) instead of the py2-only it.next().
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l0.txt'
    assert validate_communication(comm)
    assert comm.text == text_l0
    assert len(comm.sectionList) == 1
    (comm, _) = next(it)
    assert comm.id == 'les-deux-chandeliers/l1.txt'
    assert validate_communication(comm)
    assert comm.text == text_l1
    assert len(comm.sectionList) == 1
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_create_comm_complex_al_none():
    """AL_NONE leaves sectionList unset even for multi-paragraph text."""
    text = '\n\nsimple comm\t\t.\nor ...\n\nisit?\n'
    comm = create_comm('one', text, annotation_level=AL_NONE)
    assert comm.id == 'one'
    assert comm.text == text
    assert comm.sectionList is None
    assert validate_communication(comm)
def test_create_comm_stdout(output_file, text):
    """create-comm.py writes a serialized comm to stdout when given '-'."""
    p = Popen([
        'scripts/create-comm.py',
        'tests/testdata/les-deux-chandeliers.txt',
        '-'
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0
    # Round-trip the captured stdout through a file for the reader.
    with open(output_file, 'wb') as f:
        f.write(stdout)
    reader = CommunicationReader(output_file)
    it = iter(reader)
    # Python 3 compatibility: next(it) instead of the py2-only it.next().
    (comm, _) = next(it)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert comm.sectionList is None
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_create_comm_annotated(output_file, text):
    """--annotation-level section yields two sections for the test file."""
    p = Popen([
        'scripts/create-comm.py',
        '--annotation-level', 'section',
        'tests/testdata/les-deux-chandeliers.txt',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0
    reader = CommunicationReader(output_file)
    it = iter(reader)
    # Python 3 compatibility: next(it) instead of the py2-only it.next().
    (comm, _) = next(it)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert len(comm.sectionList) == 2
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_create_comm_complex_al_sentence():
    # At AL_SENTENCE, create_comm records section and sentence text spans
    # but leaves every sentence's tokenization unset.
    comm = create_comm('one', '\n\nsimple comm\t\t.\nor ...\n\nisit?\n',
                       annotation_level=AL_SENTENCE)
    assert 'one' == comm.id
    assert '\n\nsimple comm\t\t.\nor ...\n\nisit?\n' == comm.text
    assert 2 == len(comm.sectionList)
    # Section 0 covers chars 2-23 ("simple comm\t\t.\nor ...") with
    # two sentences split on the inner newline.
    sect = comm.sectionList[0]
    assert 2 == sect.textSpan.start
    assert 23 == sect.textSpan.ending
    assert 2 == len(sect.sentenceList)
    sent = sect.sentenceList[0]
    assert 2 == sent.textSpan.start
    assert 16 == sent.textSpan.ending
    assert sent.tokenization is None
    sent = sect.sentenceList[1]
    assert 17 == sent.textSpan.start
    assert 23 == sent.textSpan.ending
    assert sent.tokenization is None
    # Section 1 covers chars 25-30 ("isit?") with a single sentence.
    sect = comm.sectionList[1]
    assert 25 == sect.textSpan.start
    assert 30 == sect.textSpan.ending
    assert 1 == len(sect.sentenceList)
    sent = sect.sentenceList[0]
    assert 25 == sent.textSpan.start
    assert 30 == sent.textSpan.ending
    assert sent.tokenization is None
    assert validate_communication(comm)
def test_create_comm_stdout(output_file, text):
    """create-comm.py writes a serialized comm to stdout when given '-'."""
    proc = Popen(
        [sys.executable, 'scripts/create-comm.py',
         'tests/testdata/les-deux-chandeliers.txt', '-'],
        stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = proc.communicate()
    assert proc.returncode == 0
    # Round-trip the captured stdout through a file for the reader.
    with open(output_file, 'wb') as f:
        f.write(stdout)
    it = iter(CommunicationReader(output_file))
    (comm, _) = next(it)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert comm.sectionList is None
    # Exactly one communication is produced.
    assert next(it, None) is None
def json_str_to_validated_concrete_bytes(tweet_str):
    """Turn a UTF-8 JSON tweet byte string into serialized comm bytes.

    Returns None when the tweet cannot be converted or the resulting
    Communication does not validate.
    """
    comm = json_tweet_string_to_Communication(
        tweet_str.decode('utf-8'), True, True)
    if comm is not None and validate_communication(comm):
        return write_communication_to_buffer(comm)
    return None
def test_tweets2concrete_unicode(output_file):
    """tweets2concrete.py handles non-ASCII tweet JSON."""
    p = Popen([
        'scripts/tweets2concrete.py',
        'tests/testdata/tweets.unicode.json',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0
    reader = CommunicationReader(output_file)
    it = iter(reader)
    # Python 3 compatibility: next(it) instead of the py2-only it.next().
    (comm, _) = next(it)
    assert_first_comm(comm)
    assert validate_communication(comm)
    (comm, _) = next(it)
    assert_second_comm(comm)
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_validate_minimal_communication_with_uuid():
    """id, metadata, type and uuid are enough for a valid Communication."""
    comm = Communication()
    comm.uuid = generate_UUID()
    comm.id = "myID"
    comm.type = "Test Communication"
    comm.metadata = AnnotationMetadata(tool="TEST",
                                       timestamp=int(time.time()))
    assert validate_communication(comm)
def test_create_comm_annotated(output_file, text):
    """--annotation-level section yields two sections for the test file."""
    cmd = [sys.executable, 'scripts/create-comm.py',
           '--annotation-level', 'section',
           'tests/testdata/les-deux-chandeliers.txt', output_file]
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    proc.communicate()
    assert proc.returncode == 0
    it = iter(CommunicationReader(output_file))
    (comm, _) = next(it)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert len(comm.sectionList) == 2
    # Exactly one communication is produced.
    assert next(it, None) is None
def test_create_comm(output_file, text):
    """create-comm.py converts a text file into one Communication."""
    p = Popen([
        'scripts/create-comm.py',
        'tests/testdata/les-deux-chandeliers.txt',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0
    reader = CommunicationReader(output_file)
    it = iter(reader)
    # Python 3 compatibility: next(it) instead of the py2-only it.next().
    (comm, _) = next(it)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert comm.sectionList is None
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def test_zip_file_backed_comm_container_retrieve():
    """Every comm in the zip-backed container is retrievable and valid."""
    cc = ZipFileBackedCommunicationContainer(u'tests/testdata/simple.zip')
    assert len(cc) == 3
    assert u'simple_1' in cc
    for comm_id in cc:
        assert validate_communication(cc[comm_id])
def test_memory_backed_comm_container_retrieve():
    """Every comm in the memory-backed container is retrievable and valid."""
    cc = MemoryBackedCommunicationContainer(u'tests/testdata/simple.tar.gz')
    assert len(cc) == 3
    assert u'one' in cc
    for comm_id in cc:
        assert validate_communication(cc[comm_id])
def test_directory_backed_comm_container_retrieve():
    """Every comm in the directory-backed container is retrievable and valid."""
    cc = DirectoryBackedCommunicationContainer(u'tests/testdata/a')
    assert len(cc) == 3
    assert u'simple_1' in cc
    for comm_id in cc:
        assert validate_communication(cc[comm_id])
def test_validate_minimal_communication_with_uuid():
    """The minimal required field set passes validation."""
    comm = Communication()
    comm.type = "Test Communication"
    comm.id = "myID"
    comm.metadata = AnnotationMetadata(tool="TEST",
                                       timestamp=int(time.time()))
    comm.uuid = generate_UUID()
    assert validate_communication(comm)
def create_comm_from_tweet(json_tweet_string):
    """Build a Concrete Communication from a JSON Tweet string.

    Args:
        json_tweet_string: A JSON string for a Tweet, using the JSON
            format specified by the Twitter API:
            https://dev.twitter.com/docs/platform-objects/tweets

    Returns:
        A Concrete Communication with one section, one sentence, and a
        whitespace tokenization of the tweet text.
    """
    tweet = json.loads(json_tweet_string)
    aug = AnalyticUUIDGeneratorFactory().create()

    comm = concrete.Communication()
    comm.id = "Annotation_Test_1"
    comm.metadata = concrete.AnnotationMetadata(
        tool="Annotation Example script",
        timestamp=int(time.time())
    )
    comm.text = tweet['text']
    comm.type = "Tweet"
    comm.uuid = next(aug)

    section = concrete.Section()
    section.kind = "mySectionKind"
    section.uuid = next(aug)
    comm.sectionList = [section]

    sentence = concrete.Sentence()
    sentence.uuid = next(aug)
    section.sentenceList = [sentence]

    tokenization = concrete.Tokenization()
    tokenization.kind = concrete.TokenizationKind.TOKEN_LIST
    tokenization.metadata = concrete.AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    tokenization.tokenList = concrete.TokenList()
    tokenization.tokenList.tokenList = []
    tokenization.uuid = next(aug)
    sentence.tokenization = tokenization

    # Whitespace tokenization of the tweet text.
    for index, word in enumerate(comm.text.split()):
        token = concrete.Token()
        token.tokenIndex = index
        token.text = word
        tokenization.tokenList.tokenList.append(token)

    if validate_communication(comm):
        print("Created valid Communication")
    else:
        print("ERROR: Invalid Communication")
    return comm
def create_comm_from_tweet(json_tweet_string):
    """Create a Concrete Communication from a JSON Tweet string

    Args:
        json_tweet_string: A JSON string for a Tweet, using the JSON
            format specified by the Twitter API:
            https://dev.twitter.com/docs/platform-objects/tweets

    Returns:
        A Concrete Communication object
    """
    tweet_data = json.loads(json_tweet_string)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = concrete.Communication()
    comm.id = "Annotation_Test_1"
    comm.metadata = concrete.AnnotationMetadata(
        tool="Annotation Example script",
        timestamp=int(time.time())
    )
    comm.text = tweet_data['text']
    comm.type = "Tweet"
    # Python 3 compatibility: next(aug) instead of the py2-only aug.next().
    comm.uuid = next(aug)
    comm.sectionList = [concrete.Section()]
    comm.sectionList[0].kind = "mySectionKind"
    comm.sectionList[0].uuid = next(aug)
    comm.sectionList[0].sentenceList = [concrete.Sentence()]
    comm.sectionList[0].sentenceList[0].uuid = next(aug)
    comm.sectionList[0].sentenceList[0].tokenization = concrete.Tokenization()
    tokenization = comm.sectionList[0].sentenceList[0].tokenization
    tokenization.kind = concrete.TokenizationKind.TOKEN_LIST
    tokenization.metadata = concrete.AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    tokenization.tokenList = concrete.TokenList()
    tokenization.tokenList.tokenList = []
    tokenization.uuid = next(aug)

    # Whitespace tokenization
    tokens = comm.text.split()
    for i, token_text in enumerate(tokens):
        t = concrete.Token()
        t.tokenIndex = i
        t.text = token_text
        tokenization.tokenList.tokenList.append(t)

    # Python 3: print is a function, not a statement.
    if validate_communication(comm):
        print("Created valid Communication")
    else:
        print("ERROR: Invalid Communication")
    return comm
def test_create_comm_one_sentence_al_section():
    """AL_SECTION records the section span but no sentences."""
    comm = create_comm('one', 'simple comm\t\t.', annotation_level=AL_SECTION)
    assert comm.id == 'one'
    assert comm.text == 'simple comm\t\t.'
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 14)
    assert section.sentenceList is None
    assert validate_communication(comm)
def test_create_comm_complex():
    # Full (default) annotation: sections, sentences, and whitespace tokens.
    comm = create_comm('one', '\n\nsimple comm\t\t.\nor ...\n\nisit?\n')
    assert 'one' == comm.id
    assert '\n\nsimple comm\t\t.\nor ...\n\nisit?\n' == comm.text
    assert 3 == len(comm.sectionList)
    # Section 0: the leading blank run yields an empty span (0-0) with
    # an empty sentence list.
    sect = comm.sectionList[0]
    assert 0 == sect.textSpan.start
    assert 0 == sect.textSpan.ending
    assert 0 == len(sect.sentenceList)
    # Section 1: chars 2-23 ("simple comm\t\t.\nor ..."), two sentences.
    sect = comm.sectionList[1]
    assert 2 == sect.textSpan.start
    assert 23 == sect.textSpan.ending
    assert 2 == len(sect.sentenceList)
    sent = sect.sentenceList[0]
    assert 2 == sent.textSpan.start
    assert 16 == sent.textSpan.ending
    tl = sent.tokenization.tokenList.tokenList
    assert 3 == len(tl)
    assert 0 == tl[0].tokenIndex
    assert 'simple' == tl[0].text
    assert 1 == tl[1].tokenIndex
    assert 'comm' == tl[1].text
    assert 2 == tl[2].tokenIndex
    assert '.' == tl[2].text
    sent = sect.sentenceList[1]
    assert 17 == sent.textSpan.start
    assert 23 == sent.textSpan.ending
    tl = sent.tokenization.tokenList.tokenList
    assert 2 == len(tl)
    assert 0 == tl[0].tokenIndex
    assert 'or' == tl[0].text
    assert 1 == tl[1].tokenIndex
    assert '...' == tl[1].text
    # Section 2: chars 25-31 ("isit?\n"); the trailing newline yields an
    # empty second sentence with zero tokens.
    sect = comm.sectionList[2]
    assert 25 == sect.textSpan.start
    assert 31 == sect.textSpan.ending
    assert 2 == len(sect.sentenceList)
    sent = sect.sentenceList[0]
    assert 25 == sent.textSpan.start
    assert 30 == sent.textSpan.ending
    tl = sent.tokenization.tokenList.tokenList
    assert 1 == len(tl)
    assert 0 == tl[0].tokenIndex
    assert 'isit?' == tl[0].text
    sent = sect.sentenceList[1]
    assert 31 == sent.textSpan.start
    assert 31 == sent.textSpan.ending
    tl = sent.tokenization.tokenList.tokenList
    assert 0 == len(tl)
    assert validate_communication(comm)
def test_entity_mention_ids(self):
    """A corrupted mention UUID makes entity-mention-id validation fail."""
    comm = read_test_comm()
    self.assertTrue(validate_communication(comm))
    self.assertTrue(validate_entity_mention_ids(comm))
    # Corrupt one mention pointer so validation must flag it.
    bad_uuid = concrete.UUID(uuidString='BAD_ENTITY_MENTION_UUID')
    comm.entitySetList[0].entityList[0].mentionIdList[0] = bad_uuid
    with LogCapture() as log_capture:
        self.assertFalse(validate_entity_mention_ids(comm))
    log_capture.check(
        ('root', 'ERROR',
         StringComparison(
             r'.*invalid entityMentionId.*BAD_ENTITY_MENTION_UUID')))
def test_entity_mention_tokenization(self):
    """A bogus tokenizationId makes tokenization-id validation fail."""
    comm = read_test_comm()
    self.assertTrue(validate_communication(comm))
    self.assertTrue(validate_entity_mention_ids(comm))
    # Corrupt the tokenization pointer of the first mention.
    mention = comm.entityMentionSetList[0].mentionList[0]
    mention.tokens.tokenizationId = concrete.UUID(
        uuidString='BAD_TOKENIZATION_UUID')
    with LogCapture() as log_capture:
        self.assertFalse(validate_entity_mention_tokenization_ids(comm))
    log_capture.check(
        ('root', 'ERROR',
         StringComparison(
             r'.*invalid tokenizationId.*BAD_TOKENIZATION_UUID')))
def test_entity_mention_tokenization():
    """A bogus tokenizationId makes tokenization-id validation fail."""
    comm = read_test_comm()
    assert validate_communication(comm)
    assert validate_entity_mention_ids(comm)
    # Corrupt the tokenization pointer of the first mention.
    mention = comm.entityMentionSetList[0].mentionList[0]
    mention.tokens.tokenizationId = concrete.UUID(
        uuidString='BAD_TOKENIZATION_UUID')
    with LogCapture() as log_capture:
        assert not validate_entity_mention_tokenization_ids(comm)
    log_capture.check(
        ('root', 'ERROR',
         StringComparison(
             r'.*invalid tokenizationId.*BAD_TOKENIZATION_UUID')))
def test_compress_uuids(output_file, args):
    """compress-uuids.py keeps comms valid and shrinks the archive."""
    input_file = 'tests/testdata/simple.tar.gz'
    cmd = [sys.executable, 'scripts/compress-uuids.py',
           input_file, output_file] + list(args)
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    proc.communicate()
    assert proc.returncode == 0
    it = iter(CommunicationReader(output_file))
    expected = (('simple_1.concrete', 'one'),
                ('simple_2.concrete', 'two'),
                ('simple_3.concrete', 'three'))
    for (want_filename, want_id) in expected:
        (comm, comm_filename) = next(it)
        assert comm_filename == want_filename
        assert comm.id == want_id
        assert validate_communication(comm)
    # Compression must actually reduce the on-disk size.
    assert os.stat(output_file).st_size < os.stat(input_file).st_size
    assert next(it, None) is None
def test_compress_uuids(output_file, args):
    """compress-uuids.py keeps comms valid and shrinks the archive."""
    input_file = 'tests/testdata/simple.tar.gz'
    proc = Popen(
        [sys.executable, 'scripts/compress-uuids.py', input_file,
         output_file] + list(args),
        stdout=PIPE, stderr=PIPE)
    proc.communicate()
    assert proc.returncode == 0
    it = iter(CommunicationReader(output_file))
    for number, word in enumerate(('one', 'two', 'three'), start=1):
        (comm, comm_filename) = next(it)
        assert comm_filename == 'simple_%d.concrete' % number
        assert comm.id == word
        assert validate_communication(comm)
    # Compression must actually reduce the on-disk size.
    assert os.stat(output_file).st_size < os.stat(input_file).st_size
    exhausted = False
    try:
        next(it)
    except StopIteration:
        exhausted = True
    assert exhausted
def test_create_comm_unicode_al_sentence():
    """AL_SENTENCE measures spans in characters for non-ASCII text."""
    text = u'狐狸\t\t.'
    comm = create_comm('one', text, annotation_level=AL_SENTENCE)
    assert comm.id == 'one'
    assert comm.text == text
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 5)
    assert len(section.sentenceList) == 1
    sentence = section.sentenceList[0]
    assert (sentence.textSpan.start, sentence.textSpan.ending) == (0, 5)
    assert sentence.tokenization is None
    assert validate_communication(comm)
def test_entity_mention_ids():
    """A corrupted mention UUID makes entity-mention-id validation fail."""
    comm = read_test_comm()
    assert validate_communication(comm)
    assert validate_entity_mention_ids(comm)
    # Corrupt one mention pointer so validation must flag it.
    bad_uuid = concrete.UUID(uuidString='BAD_ENTITY_MENTION_UUID')
    comm.entitySetList[0].entityList[0].mentionIdList[0] = bad_uuid
    with LogCapture() as log_capture:
        assert not validate_entity_mention_ids(comm)
    log_capture.check(
        ('root', 'ERROR',
         StringComparison(
             r'.*invalid entityMentionId.*BAD_ENTITY_MENTION_UUID')))
def test_create_comm_tarball_per_line(output_file, text_l0, text_l1_s0,
                                      text_l1_s1, text_l1_s2, text_l1_s3,
                                      text_l1_s4):
    """--per-line emits one Communication per input line."""
    p = Popen([
        'scripts/create-comm-tarball.py',
        '--per-line',
        'tests/testdata/les-deux-chandeliers-perline.tar.gz',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = p.communicate()
    assert p.returncode == 0
    reader = CommunicationReader(output_file)
    it = iter(reader)
    expected = [
        ('les-deux-chandeliers-perline/l0.txt/0', text_l0),
        ('les-deux-chandeliers-perline/l1.txt/0', text_l1_s0),
        ('les-deux-chandeliers-perline/l1.txt/1', text_l1_s1),
        ('les-deux-chandeliers-perline/l1.txt/2', text_l1_s2),
        ('les-deux-chandeliers-perline/l1.txt/3', text_l1_s3),
        ('les-deux-chandeliers-perline/l1.txt/4', text_l1_s4),
    ]
    # Python 3 compatibility: next(it) instead of the py2-only it.next().
    for (comm_id, text) in expected:
        (comm, _) = next(it)
        assert comm.id == comm_id
        assert validate_communication(comm)
        assert comm.text == text
        assert comm.sectionList is None
    try:
        next(it)
    except StopIteration:
        pass
    else:
        assert False
def add_dictionary_tagging(comm):
    """Adds In/Out of dictionary 'POS' tags to a Communication

    Takes a Concrete Communication, adds a Part-Of-Speech tag to each
    token, where the tags record whether the token is 'In' or 'Out' of
    the system dictionary.

    Args:
        comm: A Concrete Communication with tokens

    Returns:
        The input Communication, modified in place with POS tags added
    """
    # Load the system word list; 'with' closes the handle (the original
    # left the file open).
    dictionary = set()
    with open('/usr/share/dict/words') as words_file:
        for w in words_file:
            dictionary.add(w.strip().lower())
    augf = AnalyticUUIDGeneratorFactory(comm)
    aug = augf.create()
    if comm.sectionList:
        for section in comm.sectionList:
            if section.sentenceList:
                for sentence in section.sentenceList:
                    posTagList = concrete.TokenTagging()
                    posTagList.metadata = concrete.AnnotationMetadata(
                        tool="POS Tagger", timestamp=int(time.time()))
                    posTagList.taggingType = "POS"
                    posTagList.taggedTokenList = []
                    # Python 3: next(aug) instead of the py2-only aug.next().
                    posTagList.uuid = next(aug)
                    tkzn = sentence.tokenization
                    if tkzn.tokenList:
                        for i, token in enumerate(tkzn.tokenList.tokenList):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            if token.text.lower() in dictionary:
                                tt.tag = "In"
                            else:
                                tt.tag = "Out"
                            posTagList.taggedTokenList.append(tt)
                            # Python 3: print is a function, not a statement.
                            print("%d [%s] %s" % (i, token.text, tt.tag))
                    tkzn.tokenTaggingList = [posTagList]
                    print()
    if validate_communication(comm):
        print("Created valid POS tagging for Communication")
    else:
        print("ERROR: Invalid POS tagging Communication")
    return comm
def add_dictionary_tagging(comm):
    """Adds In/Out of dictionary 'POS' tags to a Communication

    Takes a Concrete Communication, adds a Part-Of-Speech tag to each
    token, where the tags record whether the token is 'In' or 'Out' of
    the system dictionary.

    Args:
        comm: A Concrete Communication with tokens

    Returns:
        The input Communication, modified in place with POS tags added
    """
    # Load the system word list; 'with' closes the handle (the original
    # left the file open).
    dictionary = set()
    with open('/usr/share/dict/words') as words_file:
        for w in words_file:
            dictionary.add(w.strip().lower())
    augf = AnalyticUUIDGeneratorFactory(comm)
    aug = augf.create()
    if comm.sectionList:
        for section in comm.sectionList:
            if section.sentenceList:
                for sentence in section.sentenceList:
                    posTagList = concrete.TokenTagging()
                    posTagList.metadata = concrete.AnnotationMetadata(
                        tool="POS Tagger", timestamp=int(time.time()))
                    posTagList.taggingType = "POS"
                    posTagList.taggedTokenList = []
                    posTagList.uuid = next(aug)
                    tkzn = sentence.tokenization
                    if tkzn.tokenList:
                        for i, token in enumerate(tkzn.tokenList.tokenList):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            if token.text.lower() in dictionary:
                                tt.tag = "In"
                            else:
                                tt.tag = "Out"
                            posTagList.taggedTokenList.append(tt)
                            print("%d [%s] %s" % (i, token.text, tt.tag))
                    tkzn.tokenTaggingList = [posTagList]
                    print()
    if validate_communication(comm):
        print("Created valid POS tagging for Communication")
    else:
        print("ERROR: Invalid POS tagging Communication")
    return comm
def test_fetch_backed_container():
    """FetchBackedCommunicationContainer exposes comms served over fetch."""
    comms = {key: create_comm(key) for key in ('one', 'two')}
    handler = CommunicationContainerFetchHandler(comms)
    host = 'localhost'
    port = find_port()
    with SubprocessFetchCommunicationServiceWrapper(handler, host, port):
        container = FetchBackedCommunicationContainer(host, port)
        assert 2 == len(container)
        assert 'one' in container
        assert 'two' in container
        for comm_id in container:
            assert validate_communication(container[comm_id])
def test_fetch_backed_container():
    """FetchBackedCommunicationContainer exposes comms served over fetch."""
    backing = {}
    for key in ('one', 'two'):
        backing[key] = create_comm(key)
    service_handler = CommunicationContainerFetchHandler(backing)
    host = 'localhost'
    port = find_port()
    with SubprocessFetchCommunicationServiceWrapper(service_handler,
                                                    host, port):
        cc = FetchBackedCommunicationContainer(host, port)
        assert len(cc) == 2
        assert 'one' in cc
        assert 'two' in cc
        for comm_id in cc:
            assert validate_communication(cc[comm_id])
def test_create_comm_unicode():
    """Default annotation level tokenizes non-ASCII text by whitespace."""
    text = u'狐狸\t\t.'
    comm = create_comm('one', text)
    assert comm.id == 'one'
    assert comm.text == text
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 5)
    assert len(section.sentenceList) == 1
    sentence = section.sentenceList[0]
    assert (sentence.textSpan.start, sentence.textSpan.ending) == (0, 5)
    tokens = sentence.tokenization.tokenList.tokenList
    assert [(t.tokenIndex, t.text) for t in tokens] == \
        [(0, u'狐狸'), (1, '.')]
    assert validate_communication(comm)
def test_create_comm_one_sentence():
    """A single line becomes one section/sentence with whitespace tokens."""
    comm = create_comm('one', 'simple comm\t\t.')
    assert comm.id == 'one'
    assert comm.text == 'simple comm\t\t.'
    assert len(comm.sectionList) == 1
    section = comm.sectionList[0]
    assert (section.textSpan.start, section.textSpan.ending) == (0, 14)
    assert len(section.sentenceList) == 1
    sentence = section.sentenceList[0]
    assert (sentence.textSpan.start, sentence.textSpan.ending) == (0, 14)
    tokens = sentence.tokenization.tokenList.tokenList
    assert [(t.tokenIndex, t.text) for t in tokens] == \
        [(0, 'simple'), (1, 'comm'), (2, '.')]
    assert validate_communication(comm)
def assert_first_comm(comm):
    """Assert identity and timestamps of the first test tweet comm."""
    assert validate_communication(comm)
    assert comm.id == '238426131689242624'
    assert comm.startTime == 1345680194
    assert comm.endTime == 1345680194
def test_create_simple_comm():
    """create_simple_comm uses the given id and a fixed sentence."""
    comm = create_simple_comm('one')
    assert comm.id == 'one'
    assert comm.text == 'Super simple sentence .'
    assert validate_communication(comm)
def test_create_comm_ws_al_sentence():
    """Whitespace-only text yields an empty sectionList at AL_SENTENCE."""
    text = '\t \t\r\n\n'
    comm = create_comm('one', text, annotation_level=AL_SENTENCE)
    assert comm.id == 'one'
    assert comm.text == text
    assert comm.sectionList == []
    assert validate_communication(comm)
def assert_second_comm(comm):
    """Assert identity and timestamps of the second test tweet comm."""
    assert validate_communication(comm)
    assert comm.id == '238426131689242625'
    assert comm.startTime == 1345680195
    assert comm.endTime == 1345680195
def test_create_comm_ws():
    """Whitespace-only text yields an empty sectionList by default."""
    text = '\t \t\r\n\n'
    comm = create_comm('one', text)
    assert comm.id == 'one'
    assert comm.text == text
    assert comm.sectionList == []
    assert validate_communication(comm)