def test_x_prefix_bad_comm_uuid(self):
    """A malformed Communication UUID must make create() raise ValueError."""
    # First group has 9 hex digits instead of 8 -- deliberately invalid.
    bad_uuid = '7575a428a-aaf7-4c2e-929e-1e2a0ab59e16'
    comm = Duck()
    comm.uuid = Duck()
    comm.uuid.uuidString = bad_uuid
    factory = AnalyticUUIDGeneratorFactory(comm)
    with self.assertRaises(ValueError):
        factory.create()
def test_x_prefix_no_comm(self):
    """Every generator's UUIDs start with the factory's x-prefix.

    The x-prefix is the first 8+1+4 characters (first hex group, a dash,
    and the second hex group) of the factory's comm_uuid.
    """
    n = 1000
    augf = AnalyticUUIDGeneratorFactory()
    u = augf.comm_uuid
    prefix_len = 8 + 1 + 4  # 8 hex digits, '-', 4 hex digits
    for _ in range(n):  # range/next() work on both Python 2 and 3
        aug = augf.create()
        self.assertTrue(next(aug).uuidString.startswith(u[:prefix_len]))
def create_comm_from_tweet(json_tweet_string):
    """Create a Concrete Communication from a JSON Tweet string

    Args:
        json_tweet_string: A JSON string for a Tweet, using the JSON
            format specified by the Twitter API:
            https://dev.twitter.com/docs/platform-objects/tweets

    Returns:
        A Concrete Communication object
    """
    tweet_data = json.loads(json_tweet_string)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = concrete.Communication()
    comm.id = "Annotation_Test_1"
    comm.metadata = concrete.AnnotationMetadata(
        tool="Annotation Example script",
        timestamp=int(time.time())
    )
    comm.text = tweet_data['text']
    comm.type = "Tweet"
    # next(aug) works on both Python 2 and 3; aug.next() is 2-only.
    comm.uuid = next(aug)

    comm.sectionList = [concrete.Section()]
    comm.sectionList[0].kind = "mySectionKind"
    comm.sectionList[0].uuid = next(aug)
    comm.sectionList[0].sentenceList = [concrete.Sentence()]
    comm.sectionList[0].sentenceList[0].uuid = next(aug)
    comm.sectionList[0].sentenceList[0].tokenization = concrete.Tokenization()

    tokenization = comm.sectionList[0].sentenceList[0].tokenization
    tokenization.kind = concrete.TokenizationKind.TOKEN_LIST
    tokenization.metadata = concrete.AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    tokenization.tokenList = concrete.TokenList()
    tokenization.tokenList.tokenList = []
    tokenization.uuid = next(aug)

    # Whitespace tokenization
    tokens = comm.text.split()
    for i, token_text in enumerate(tokens):
        t = concrete.Token()
        t.tokenIndex = i
        t.text = token_text
        tokenization.tokenList.tokenList.append(t)

    if validate_communication(comm):
        print("Created valid Communication")
    else:
        print("ERROR: Invalid Communication")

    return comm
def create_comm_from_tweet(json_tweet_string):
    """Create a Concrete Communication from a JSON Tweet string

    Args:
        json_tweet_string: A JSON string for a Tweet, using the JSON
            format specified by the Twitter API:
            https://dev.twitter.com/docs/platform-objects/tweets

    Returns:
        A Concrete Communication object
    """
    tweet_data = json.loads(json_tweet_string)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    # Top-level Communication fields (UUIDs must be drawn in the same
    # order: communication, section, sentence, tokenization).
    comm = concrete.Communication()
    comm.id = "Annotation_Test_1"
    comm.metadata = concrete.AnnotationMetadata(
        tool="Annotation Example script",
        timestamp=int(time.time())
    )
    comm.text = tweet_data['text']
    comm.type = "Tweet"
    comm.uuid = next(aug)

    # One section holding one sentence.
    section = concrete.Section()
    section.kind = "mySectionKind"
    section.uuid = next(aug)
    comm.sectionList = [section]

    sentence = concrete.Sentence()
    sentence.uuid = next(aug)
    section.sentenceList = [sentence]

    tokenization = concrete.Tokenization()
    sentence.tokenization = tokenization
    tokenization.kind = concrete.TokenizationKind.TOKEN_LIST
    tokenization.metadata = concrete.AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    tokenization.tokenList = concrete.TokenList()
    tokenization.tokenList.tokenList = []
    tokenization.uuid = next(aug)

    # Whitespace tokenization
    for index, word in enumerate(comm.text.split()):
        token = concrete.Token()
        token.tokenIndex = index
        token.text = word
        tokenization.tokenList.tokenList.append(token)

    if validate_communication(comm):
        print("Created valid Communication")
    else:
        print("ERROR: Invalid Communication")

    return comm
def test_y_prefix(self):
    """All UUIDs from one generator share that generator's y-prefix.

    The y-prefix is the first 8+1+4+1+4+1+4 characters (the first four
    dash-separated hex groups).
    """
    m = 100
    n = 100
    augf = AnalyticUUIDGeneratorFactory()
    prefix_len = 8 + 1 + 4 + 1 + 4 + 1 + 4
    for _ in range(m):  # range/next() work on both Python 2 and 3
        aug = augf.create()
        uu = next(aug).uuidString
        for _ in range(n - 1):
            self.assertTrue(
                next(aug).uuidString.startswith(uu[:prefix_len]))
def test_y_prefix_spread(self):
    """Distinct generators get distinct y-prefixes (with high probability)."""
    m = 10
    augf = AnalyticUUIDGeneratorFactory()
    # union bound: (1/16)^8 * m^2 = 2e-8
    prefix_len = 8 + 1 + 4 + 1 + 4 + 1 + 4
    s = set()
    for _ in range(m):  # range/next() work on both Python 2 and 3
        aug = augf.create()
        u = next(aug).uuidString
        s.add(u[:prefix_len])
    # assertEqual: assertEquals is a deprecated alias
    self.assertEqual(len(s), m)
def test_x_prefix_with_comm(self):
    """A factory seeded with a Communication reuses its UUID's x-prefix."""
    n = 1000
    u = '7575a428-aaf7-4c2e-929e-1e2a0ab59e16'
    comm = Duck()
    comm.uuid = Duck()
    comm.uuid.uuidString = u
    augf = AnalyticUUIDGeneratorFactory(comm)
    # assertEqual: assertEquals is a deprecated alias
    self.assertEqual(augf.comm_uuid, u)
    prefix_len = 8 + 1 + 4  # 8 hex digits, '-', 4 hex digits
    for _ in range(n):  # range/next() work on both Python 2 and 3
        aug = augf.create()
        self.assertTrue(next(aug).uuidString.startswith(u[:prefix_len]))
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:
    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
      The string will be whitespace-tokenized.

    Returns:
    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')
    toolname = "TEST"
    timestamp = int(time.time())
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    # next(aug) works on both Python 2 and 3; aug.next() is 2-only.
    comm = Communication(id=comm_id,
                         metadata=AnnotationMetadata(tool=toolname,
                                                     timestamp=timestamp),
                         type=toolname,
                         uuid=next(aug))
    tokenization = Tokenization(kind=TokenizationKind.TOKEN_LIST,
                                metadata=AnnotationMetadata(
                                    tool=toolname, timestamp=timestamp),
                                tokenList=TokenList(tokenList=[]),
                                uuid=next(aug))
    token_string_list = sentence_string.split()
    for i, token_string in enumerate(token_string_list):
        tokenization.tokenList.tokenList.append(
            Token(text=token_string, tokenIndex=i))
    sentence = Sentence(textSpan=TextSpan(0, len(sentence_string)),
                        tokenization=tokenization,
                        uuid=next(aug))
    section = Section(kind="SectionKind",
                      sentenceList=[sentence],
                      textSpan=TextSpan(0, len(sentence_string)),
                      uuid=next(aug))
    comm.sectionList = [section]
    comm.text = sentence_string
    return comm
def test_z_increment(self):
    """Within one generator the z field increments by 1 (mod 2**48)."""
    m = 100
    n = 100
    augf = AnalyticUUIDGeneratorFactory()
    # z field starts after the first four hex groups and their dashes
    z_start = 8 + 1 + 4 + 1 + 4 + 1 + 4 + 1
    for _ in range(m):  # range/next() work on both Python 2 and 3
        aug = augf.create()
        u = next(aug).uuidString
        z = int(u[z_start:], 16)
        for _ in range(n - 1):
            u = next(aug).uuidString
            # assertEqual: assertEquals is a deprecated alias
            self.assertEqual(int(u[z_start:], 16), (z + 1) % 2**48)
            z = (z + 1) % 2**48
def test_spread(self):
    """All m*n generated UUIDs are pairwise distinct (w.h.p.)."""
    m = 100
    n = 100
    augf = AnalyticUUIDGeneratorFactory()
    # union bound: (2m-1)*(1/16)^12 * n^2 = 7e-9
    s = set()
    for _ in range(m):  # range/next() work on both Python 2 and 3
        aug = augf.create()
        s.add(next(aug).uuidString)
        for _ in range(n - 1):
            s.add(next(aug).uuidString)
    # assertEqual: assertEquals is a deprecated alias
    self.assertEqual(len(s), m * n)
def add_dictionary_tagging(comm):
    """Adds In/Out of dictionary 'POS' tags to a Communication

    Takes a Concrete Communication, adds a Part-Of-Speech tag to each
    token, where the tags record whether the token is 'In' or 'Out' of
    the system dictionary.

    Args:
        comm: A Concrete Communication with tokens

    Returns:
        A copy of the original Communication, with POS tags added
    """
    dictionary = set()
    # Use a context manager so the word-list handle is closed (the
    # original leaked the open file).
    with open('/usr/share/dict/words') as word_file:
        for w in word_file:
            dictionary.add(w.strip().lower())

    augf = AnalyticUUIDGeneratorFactory(comm)
    aug = augf.create()
    if comm.sectionList:
        for section in comm.sectionList:
            if section.sentenceList:
                for sentence in section.sentenceList:
                    posTagList = concrete.TokenTagging()
                    posTagList.metadata = concrete.AnnotationMetadata(
                        tool="POS Tagger", timestamp=int(time.time()))
                    posTagList.taggingType = "POS"
                    posTagList.taggedTokenList = []
                    # next(aug) works on Python 2 and 3
                    posTagList.uuid = next(aug)
                    tkzn = sentence.tokenization
                    if tkzn.tokenList:
                        for i, token in enumerate(tkzn.tokenList.tokenList):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            if token.text.lower() in dictionary:
                                tt.tag = "In"
                            else:
                                tt.tag = "Out"
                            posTagList.taggedTokenList.append(tt)
                            print("%d [%s] %s" % (i, token.text, tt.tag))
                    tkzn.tokenTaggingList = [posTagList]
                    print()

    if validate_communication(comm):
        print("Created valid POS tagging for Communication")
    else:
        print("ERROR: Invalid POS tagging Communication")
    return comm
def add_dictionary_tagging(comm):
    """Adds In/Out of dictionary 'POS' tags to a Communication

    Takes a Concrete Communication, adds a Part-Of-Speech tag to each
    token, where the tags record whether the token is 'In' or 'Out' of
    the system dictionary.

    Args:
        comm: A Concrete Communication with tokens

    Returns:
        A copy of the original Communication, with POS tags added
    """
    dictionary = set()
    # Use a context manager so the word-list handle is closed (the
    # original leaked the open file).
    with open('/usr/share/dict/words') as word_file:
        for w in word_file:
            dictionary.add(w.strip().lower())

    augf = AnalyticUUIDGeneratorFactory(comm)
    aug = augf.create()
    if comm.sectionList:
        for section in comm.sectionList:
            if section.sentenceList:
                for sentence in section.sentenceList:
                    posTagList = concrete.TokenTagging()
                    posTagList.metadata = concrete.AnnotationMetadata(
                        tool="POS Tagger", timestamp=int(time.time()))
                    posTagList.taggingType = "POS"
                    posTagList.taggedTokenList = []
                    posTagList.uuid = next(aug)
                    tkzn = sentence.tokenization
                    if tkzn.tokenList:
                        for i, token in enumerate(tkzn.tokenList.tokenList):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            if token.text.lower() in dictionary:
                                tt.tag = "In"
                            else:
                                tt.tag = "Out"
                            posTagList.taggedTokenList.append(tt)
                            print("%d [%s] %s" % (i, token.text, tt.tag))
                    tkzn.tokenTaggingList = [posTagList]
                    print()

    if validate_communication(comm):
        print("Created valid POS tagging for Communication")
    else:
        print("ERROR: Invalid POS tagging Communication")
    return comm
def annotate(self, communication):
    """Attach a LanguageIdentification annotation to the Communication.

    Classifies the concatenated text of all sections whose kind is
    "content" and records a per-language probability map.
    """
    # Concatenate only the "content" sections' text spans.
    text = ""
    for section in communication.sectionList:
        if section.kind == "content":
            text += communication.text[
                section.textSpan.start:section.textSpan.ending]
    # Map the classifier's ISO 639-1 codes to ISO 639-3; exp() is applied
    # to each score (presumably the classifier returns log-probabilities
    # -- confirm against the classifier's contract).
    # .items() works on both Python 2 and 3 (iteritems() is 2-only).
    scores = {languages.get(iso639_1_code=k).iso639_3_code: math.exp(v)
              for k, v in self.classifier.classify(text).items()}
    logging.info(str(scores))
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    lid = LanguageIdentification(
        uuid=next(aug),
        languageToProbabilityMap=scores,
        metadata=AnnotationMetadata(tool="valid",
                                    timestamp=int(time.time()),
                                    kBest=len(scores)),
    )
    communication.lidList.append(lid)
    return communication
def test_z_increment(self):
    """Within one generator the z field increments by 1 (mod 2**48)."""
    m = 100
    n = 100
    augf = AnalyticUUIDGeneratorFactory()
    # z field starts after the first four hex groups and their dashes
    z_start = 8 + 1 + 4 + 1 + 4 + 1 + 4 + 1
    for _ in range(m):  # range/next() work on both Python 2 and 3
        aug = augf.create()
        u = next(aug).uuidString
        z = int(u[z_start:], 16)
        for _ in range(n - 1):
            u = next(aug).uuidString
            # assertEqual: assertEquals is a deprecated alias
            self.assertEqual(
                int(u[z_start:], 16),
                (z + 1) % 2**48
            )
            z = (z + 1) % 2**48
def annotate(self, communication):
    """Split each section's text into sentences with nltk.sent_tokenize,
    appending Sentence objects whose text spans are offsets into the
    whole communication text.
    """
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    for section in communication.sectionList:
        # Text still to be scanned; re-sliced as sentences are consumed.
        text = communication.text[section.textSpan.start:section.textSpan.ending]
        current_offset = section.textSpan.start
        for sent in nltk.sent_tokenize(text):
            logging.info("Found sentence %s", sent)
            # Locate this sentence inside the remaining text; combined with
            # current_offset this yields document-level offsets.
            initial = text.find(sent)
            s = Sentence(uuid=aug.next(), textSpan=TextSpan(start=current_offset + initial, ending=current_offset + initial + len(sent)))
            section.sentenceList.append(s)
            current_offset = current_offset + initial + len(sent)
            # NOTE(review): this re-slices from the full communication text to
            # its end rather than stopping at section.textSpan.ending --
            # presumably safe only because sent_tokenize was run on the
            # section slice; confirm.
            text = communication.text[current_offset:]
    return communication
def annotate(self, communication):
    """Run NLTK named-entity chunking over tagged tokens and append an
    EntitySet and EntityMentionSet to the Communication.

    Uses the most recent TokenTagging of each tokenization as the POS
    input to nltk.ne_chunk.
    """
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    # (entity name, chunk label) -> list of EntityMention
    entities = {}
    for section in communication.sectionList:
        for sentence in section.sentenceList:
            tokens = [
                x.text for x in sentence.tokenization.tokenList.tokenList
            ]
            tags = [
                x.tag
                for x in sentence.tokenization
                .tokenTaggingList[-1].taggedTokenList
            ]
            # list(zip(...)): ne_chunk needs a sequence; zip is lazy on Py3.
            for subtree in nltk.ne_chunk(list(zip(tokens, tags))).subtrees():
                if subtree.label() != "S":
                    name = " ".join([x[0] for x in subtree.leaves()])
                    logging.info("Found named entity \"%s\"", name)
                    key = (name, subtree.label())
                    # BUG FIX: the lookup previously used `name` alone
                    # while the dict is keyed by (name, label), so repeat
                    # mentions of an entity never accumulated.
                    entities[key] = entities.get(key, []) + [
                        EntityMention(
                            uuid=next(aug),
                            entityType=subtree.label(),
                            tokens=TokenRefSequence(
                                tokenIndexList=[],
                                tokenizationId=sentence.tokenization.uuid))
                    ]
    communication.entitySetList.append(
        EntitySet(uuid=next(aug),
                  metadata=AnnotationMetadata(timestamp=int(time.time()),
                                              tool="nltk"),
                  entityList=[
                      Entity(uuid=next(aug),
                             mentionIdList=[x.uuid for x in v],
                             canonicalName=k[0],
                             type=k[1])
                      # .items() works on Python 2 and 3
                      for k, v in entities.items()
                  ]))
    communication.entityMentionSetList.append(
        EntityMentionSet(uuid=next(aug),
                         metadata=AnnotationMetadata(
                             timestamp=int(time.time()), tool="nltk"),
                         mentionList=sum(entities.values(), [])))
    return communication
def annotate(self, communication):
    """Tokenize each sentence's text span with nltk.word_tokenize and
    attach a fresh TOKEN_LIST Tokenization to every sentence."""
    # print() works on both Python 2 and 3 for a single argument
    print(communication.id)
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    for section in communication.sectionList:
        for sentence in section.sentenceList:
            text = communication.text[
                sentence.textSpan.start:sentence.textSpan.ending]
            sentence.tokenization = Tokenization(
                uuid=next(aug),
                kind=TokenizationKind.TOKEN_LIST,
                tokenList=TokenList(tokenList=[]),
                tokenTaggingList=[],
                metadata=AnnotationMetadata(timestamp=int(time.time()),
                                            tool="nltk"))
            for i, token in enumerate(nltk.word_tokenize(text)):
                logging.info("Found token %s", token)
                sentence.tokenization.tokenList.tokenList.append(
                    Token(tokenIndex=i, text=token))
    return communication
def create_comm(comm_id, text='', comm_type='article', section_kind='passage',
                metadata_tool='concrete-python', metadata_timestamp=None,
                annotation_level=AL_TOKEN):
    '''
    Create a simple, valid Communication from text.
    By default the text will be split by double-newlines into sections
    and then by single newlines into sentences within those sections.

    annotation_level controls the amount of annotation that is added:
      AL_NONE      add no optional annotations (not even sections)
      AL_SECTION   add sections but not sentences
      AL_SENTENCE  add sentences but not tokens
      AL_TOKEN     add all annotations, up to tokens (the default)

    If metadata_timestamp is None, the current time will be used.
    '''
    if metadata_timestamp is None:
        metadata_timestamp = int(time.time())
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    sections = (annotation_level is not None) and (annotation_level != AL_NONE)
    return Communication(
        id=comm_id,
        # next(aug) works on both Python 2 and 3; aug.next() is 2-only.
        uuid=next(aug),
        type=comm_type,
        text=text,
        metadata=AnnotationMetadata(
            tool=metadata_tool,
            timestamp=metadata_timestamp,
        ),
        sectionList=(
            [
                create_section(sec_text, sec_start, sec_end, section_kind,
                               aug, metadata_tool, metadata_timestamp,
                               annotation_level)
                for (sec_text, sec_start, sec_end) in _split(text, '\n\n')
            ] if text.strip() else []
        ) if sections else None,
    )
def create_comm(comm_id, text='', comm_type='article', section_kind='passage',
                metadata_tool='concrete-python', metadata_timestamp=None,
                annotation_level=AL_TOKEN):
    '''
    Create a simple, valid Communication from text.
    By default the text will be split by double-newlines into sections
    and then by single newlines into sentences within those sections.

    annotation_level controls the amount of annotation that is added:
      AL_NONE      add no optional annotations (not even sections)
      AL_SECTION   add sections but not sentences
      AL_SENTENCE  add sentences but not tokens
      AL_TOKEN     add all annotations, up to tokens (the default)

    If metadata_timestamp is None, the current time will be used.
    '''
    if metadata_timestamp is None:
        metadata_timestamp = int(time.time())
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    sections = (annotation_level is not None) and (annotation_level != AL_NONE)
    return Communication(
        id=comm_id,
        # next(aug) works on both Python 2 and 3; aug.next() is 2-only.
        uuid=next(aug),
        type=comm_type,
        text=text,
        metadata=AnnotationMetadata(
            tool=metadata_tool,
            timestamp=metadata_timestamp,
        ),
        sectionList=([
            create_section(sec_text, sec_start, sec_end, section_kind,
                           aug, metadata_tool, metadata_timestamp,
                           annotation_level)
            for (sec_text, sec_start, sec_end) in _split(text, '\n\n')
        ] if text.strip() else []) if sections else None,
    )
def annotate(self, communication):
    """Append a Penn Treebank POS TokenTagging (via nltk.pos_tag) to
    every sentence's tokenization."""
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    for section in communication.sectionList:
        for sentence in section.sentenceList:
            tokens = [
                x.text for x in sentence.tokenization.tokenList.tokenList
            ]
            sentence.tokenization.tokenTaggingList.append(
                TokenTagging(uuid=next(aug),
                             metadata=AnnotationMetadata(
                                 timestamp=int(time.time()), tool="nltk"),
                             taggedTokenList=[],
                             taggingType="Penn Treebank"))
            for i, (tok, tag) in enumerate(nltk.pos_tag(tokens)):
                logging.info("Tagged %s as %s", tok, tag)
                # Fill the tagging we just appended.
                sentence.tokenization.tokenTaggingList[
                    -1].taggedTokenList.append(
                        TaggedToken(tokenIndex=i, tag=tag))
    return communication
def index():
    """Web handler: send POSTed text to the remote Annotator service and
    render the entities it finds, above a fresh input form."""
    text = request.forms.get('text')
    # Thrift client over a framed transport.
    transport = TTransport.TFramedTransport(
        TSocket.TSocket(options.annotator_host, options.annotator_port))
    protocol = TCompactProtocol.TCompactProtocol(transport)
    client = Annotator.Client(protocol)
    transport.open()
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    c = Communication(
        id="",
        text=text,
        # next(aug) works on both Python 2 and 3; aug.next() is 2-only.
        uuid=next(aug),
        type="user-supplied input",
        metadata=AnnotationMetadata(timestamp=int(time.time()), tool="stdin"),
        sectionList=[
            Section(uuid=next(aug),
                    sentenceList=[],
                    kind="paragraph",
                    textSpan=TextSpan(start=0, ending=len(text)))
        ],
        entitySetList=[],
        entityMentionSetList=[],
    )
    new_c = client.annotate(c)
    # Close the transport instead of leaking the socket per request.
    transport.close()
    form = '''<form action="/" method="post"> Enter or paste some text: <input name="text" type="text" /> <input value="Submit" type="submit" /> </form> '''
    return form + "\n".join(["<h3>%s</h3>" % text] + [
        "\n".join([
            "<br>%s %s" % (e.type, e.canonicalName) for e in es.entityList
        ]) for es in new_c.entitySetList
    ])
def json_tweet_object_to_Communication(tweet):
    """Build a Concrete Communication from a decoded Tweet JSON object.

    Args:
        tweet: A dict decoded from a Tweet's JSON representation

    Returns:
        A Concrete Communication wrapping the tweet's text and TweetInfo
    """
    tweet_info = json_tweet_object_to_TweetInfo(tweet)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    if 'id_str' in tweet:
        tweet_id = tweet['id_str']
    else:
        logging.warning('Tweet has no id_str, leaving communication id blank')
        tweet_id = None

    tweet_time = datetime_to_timestamp(
        datetime.strptime(tweet_info.createdAt, CREATED_AT_FORMAT))

    comm = Communication(
        communicationMetadata=CommunicationMetadata(
            tweetInfo=tweet_info),
        metadata=AnnotationMetadata(
            tool=TOOL_NAME,
            timestamp=int(time.time())),
        originalText=tweet_info.text,
        text=tweet_info.text,
        type=TWEET_TYPE,
        # next(aug) works on both Python 2 and 3; aug.next() is 2-only.
        uuid=next(aug),
        startTime=tweet_time,
        endTime=tweet_time,
        id=tweet_id
    )

    # either this, or pass in gen as parameter to fx
    # latter is more annoying to test but slightly cleaner
    if tweet_info.lid is not None:
        tweet_info.lid.uuid = next(aug)
        lidList = [tweet_info.lid]
        comm.lidList = lidList
    return comm
def json_tweet_object_to_Communication(tweet):
    """Build a Concrete Communication from a decoded Tweet JSON object.

    Args:
        tweet: A dict decoded from a Tweet's JSON representation

    Returns:
        A Concrete Communication wrapping the tweet's text and TweetInfo
    """
    tweet_info = json_tweet_object_to_TweetInfo(tweet)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    if 'id_str' in tweet:
        tweet_id = tweet['id_str']
    else:
        logging.warning('Tweet has no id_str, leaving communication id blank')
        tweet_id = None

    tweet_time = unix_time(
        datetime.strptime(tweet_info.createdAt, CREATED_AT_FORMAT))

    comm = Communication(
        communicationMetadata=CommunicationMetadata(
            tweetInfo=tweet_info),
        metadata=AnnotationMetadata(
            tool=TOOL_NAME,
            timestamp=int(time.time())),
        originalText=tweet_info.text,
        text=tweet_info.text,
        type=TWEET_TYPE,
        # next(aug) works on both Python 2 and 3; aug.next() is 2-only.
        uuid=next(aug),
        startTime=tweet_time,
        endTime=tweet_time,
        id=tweet_id
    )

    # either this, or pass in gen as parameter to fx
    # latter is more annoying to test but slightly cleaner
    if tweet_info.lid is not None:
        tweet_info.lid.uuid = next(aug)
        lidList = [tweet_info.lid]
        comm.lidList = lidList
    return comm
# Interactive CLI client: read lines from stdin, wrap each in a Communication,
# and send it to a remote Annotator thrift service.
# NOTE(review): `parser` is created earlier in the file (not shown here).
parser.add_argument("-p", "--port", dest="port", type=int, default=9090)
parser.add_argument("-H", "--host", dest="host", default="localhost")
options = parser.parse_args()
# Make socket
transport = TSocket.TSocket(options.host, options.port)
# Buffering is critical. Raw sockets are very slow
transport = TTransport.TBufferedTransport(transport)
# Wrap in a protocol
protocol = TCompactProtocol.TCompactProtocol(transport)
# Create a client to use the protocol encoder
client = Annotator.Client(protocol)
# Connect!
transport.open()
while True:
    # raw_input and the print statement below make this script Python-2-only.
    s = raw_input("Write some text > ")
    if re.match(r"^\s*$", s):
        # A blank (or all-whitespace) line ends the session.
        break
    else:
        augf = AnalyticUUIDGeneratorFactory()
        aug = augf.create()
        c = Communication(id="",
                          text=s,
                          uuid=aug.next(),
                          type="tweet",
                          metadata=AnnotationMetadata(timestamp=0,
                                                      tool="stdin"),
                          lidList=[])
        new_c = client.annotate(c)
        print new_c
parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", dest="input", help="Input file where each line is \"ID<tab>TAG<tab>TEXT\"") parser.add_argument("-o", "--output", dest="output", help="Tar file to write Communications to") parser.add_argument("-t", "--tag_type", dest="tag_type", default=None, help="Type of tag (e.g. \"language\"): defaults to None, in which case the tag column is ignored (but must still be present!)") options = parser.parse_args() ugf = AnalyticUUIDGeneratorFactory() ofd = CommunicationWriterTGZ(options.output) with reader(gzip.open(options.input)) as ifd: for i, line in enumerate(ifd): toks = line.strip().split("\t") if len(toks) != 3: continue cid, label, text = toks g = ugf.create() t = int(time()) comm = Communication(id=cid, uuid=g.next(), type="Text document", text=text, communicationTaggingList=[CommunicationTagging(uuid=g.next(), metadata=AnnotationMetadata(tool="Gold labeling", timestamp=t, kBest=1, ), taggingType=options.tag_type, tagList=[label], confidenceList=[1.0], )], metadata=AnnotationMetadata(tool="text_to_concrete.py ingester", timestamp=t, kBest=1),
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:
    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
      The string will be whitespace-tokenized.

    Returns:
    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')
    toolname = "TEST"
    timestamp = int(time.time())
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    # next(aug) works on both Python 2 and 3; aug.next() is 2-only.
    comm = Communication(
        id=comm_id,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        type=toolname,
        uuid=next(aug)
    )
    tokenization = Tokenization(
        kind=TokenizationKind.TOKEN_LIST,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        tokenList=TokenList(
            tokenList=[]),
        uuid=next(aug)
    )
    token_string_list = sentence_string.split()
    for i, token_string in enumerate(token_string_list):
        tokenization.tokenList.tokenList.append(Token(text=token_string,
                                                      tokenIndex=i))
    sentence = Sentence(
        textSpan=TextSpan(0, len(sentence_string)),
        tokenization=tokenization,
        uuid=next(aug)
    )
    section = Section(
        kind="SectionKind",
        sentenceList=[sentence],
        textSpan=TextSpan(0, len(sentence_string)),
        uuid=next(aug)
    )
    comm.sectionList = [section]
    comm.text = sentence_string
    return comm