def create_comm_from_tweet(json_tweet_string):
    """Build a single-sentence Concrete Communication from a Tweet.

    The Tweet's 'text' field becomes the Communication text, which is
    wrapped in one Section holding one Sentence.  The Sentence gets a
    TOKEN_LIST Tokenization produced by splitting the text on
    whitespace.  The result is run through validate_communication()
    and the outcome is printed.

    Args:
        json_tweet_string: A JSON string for a Tweet, using the JSON
            format specified by the Twitter API:
              https://dev.twitter.com/docs/platform-objects/tweets

    Returns:
        The newly created Concrete Communication object
    """
    tweet = json.loads(json_tweet_string)
    uuid_gen = AnalyticUUIDGeneratorFactory().create()

    # Top-level Communication fields.
    comm = concrete.Communication()
    comm.id = "Annotation_Test_1"
    comm.metadata = concrete.AnnotationMetadata(
        tool="Annotation Example script",
        timestamp=int(time.time())
    )
    comm.text = tweet['text']
    comm.type = "Tweet"
    comm.uuid = next(uuid_gen)

    # One Section containing one Sentence.  UUIDs are drawn in the
    # order Communication, Section, Sentence, Tokenization.
    section = concrete.Section()
    comm.sectionList = [section]
    section.kind = "mySectionKind"
    section.uuid = next(uuid_gen)

    sentence = concrete.Sentence()
    section.sentenceList = [sentence]
    sentence.uuid = next(uuid_gen)

    tokenization = concrete.Tokenization()
    sentence.tokenization = tokenization
    tokenization.kind = concrete.TokenizationKind.TOKEN_LIST
    tokenization.metadata = concrete.AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    token_list = concrete.TokenList()
    token_list.tokenList = []
    tokenization.tokenList = token_list
    tokenization.uuid = next(uuid_gen)

    # Whitespace tokenization
    for index, word in enumerate(comm.text.split()):
        token = concrete.Token()
        token.tokenIndex = index
        token.text = word
        token_list.tokenList.append(token)

    if validate_communication(comm):
        outcome = "Created valid Communication"
    else:
        outcome = "ERROR: Invalid Communication"
    print(outcome)
    return comm
def test_check_required_fields():
    # A field declared "required" in a .thrift file only seems to be
    # enforced, in the Python emitted by the Thrift compiler, inside
    # the generated class's validate() method.  The
    # ThriftGeneratedClass.thrift_spec structure lists field names and
    # types but does not appear to carry a "required" flag.
    #
    # For reference, validate() on the Communication class reads:
    #
    #   def validate():
    #     if self.id is None:
    #       raise TProtocol.TProtocolException(
    #           message='Required field id is unset!')
    #     if self.uuid is None:
    #       raise TProtocol.TProtocolException(
    #           message='Required field uuid is unset!')
    #     if self.type is None:
    #       raise TProtocol.TProtocolException(
    #           message='Required field type is unset!')
    #     return
    #
    # validate() raises on the first missing required field, so there
    # doesn't seem to be a way to learn about several missing fields
    # at once other than filling one in and validating again -- which
    # is the sequence this test walks through.

    def expect_validation_error(candidate, expected_message):
        # Validation must fail, and the log captured while it ran is
        # checked against a single root-logger ERROR record.
        with LogCapture() as captured:
            assert not validate_thrift_deep(candidate)
        captured.check(('root', 'ERROR', expected_message))

    comm = concrete.Communication()
    expect_validation_error(
        comm, "Communication: Required Field 'id' is unset!")

    comm.id = "ID"
    expect_validation_error(
        comm, "Communication: Required Field 'uuid' is unset!")

    comm.uuid = concrete.UUID(uuidString="TEST_UUID")
    expect_validation_error(
        comm,
        StringComparison(r".*TEST_UUID.*Required Field 'type' is unset!"))

    # With metadata and type filled in, validation should succeed.
    comm.metadata = concrete.AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    comm.type = "OTHER"
    assert validate_thrift_deep(comm)
def add_dictionary_tagging(comm):
    """Adds In/Out of dictionary 'POS' tags to a Communication

    Takes a Concrete Communication, adds a Part-Of-Speech tag to each
    token, where the tags record whether the token is 'In' or 'Out' of
    the system dictionary (/usr/share/dict/words, compared
    case-insensitively).

    The Communication is modified in place: the tokenTaggingList of
    every sentence's Tokenization is replaced by a list holding one
    new "POS" TokenTagging.  Sentences that have no Tokenization are
    skipped.  Each tagged token is printed to stdout, and the result
    of validate_communication() is reported there as well.

    Args:
        comm: A Concrete Communication with tokens

    Returns:
        The same Communication object that was passed in, with POS
        tags added
    """
    # Read the word list inside a context manager so the file handle
    # is closed as soon as the set is built.
    with open('/usr/share/dict/words') as word_file:
        dictionary = {w.strip().lower() for w in word_file}

    augf = AnalyticUUIDGeneratorFactory(comm)
    aug = augf.create()

    if comm.sectionList:
        for section in comm.sectionList:
            if section.sentenceList:
                for sentence in section.sentenceList:
                    tkzn = sentence.tokenization
                    if tkzn is None:
                        # Nothing to tag and nowhere to attach a
                        # TokenTagging.
                        continue

                    posTagList = concrete.TokenTagging()
                    posTagList.metadata = concrete.AnnotationMetadata(
                        tool="POS Tagger", timestamp=int(time.time()))
                    posTagList.taggingType = "POS"
                    posTagList.taggedTokenList = []
                    posTagList.uuid = next(aug)

                    if tkzn.tokenList:
                        for i, token in enumerate(tkzn.tokenList.tokenList):
                            tt = concrete.TaggedToken()
                            tt.tokenIndex = i
                            if token.text.lower() in dictionary:
                                tt.tag = "In"
                            else:
                                tt.tag = "Out"
                            posTagList.taggedTokenList.append(tt)
                            print("%d [%s] %s" % (i, token.text, tt.tag))
                    # Replaces any TokenTaggings already present on
                    # this Tokenization.
                    tkzn.tokenTaggingList = [posTagList]
            print()

    if validate_communication(comm):
        print("Created valid POS tagging for Communication")
    else:
        print("ERROR: Invalid POS tagging Communication")
    return comm
def add_chunks_to_comm(comm, chunklink, fail_on_error):
    '''Converts the first constituency tree of each tokenization to chunks
    and adds them as a TokenTagging to the communication.

    comm - Communication to be annotated.
    chunklink - Path to the modified chunklink perl script.
    fail_on_error - If true, any exception is re-raised after being
        logged; otherwise it is logged and processing continues with
        the next tokenization.

    Returns a tuple (num_chunked, num_sents): the number of
    tokenizations that received a "CHUNK" TokenTagging and the number
    of tokenizations visited.
    '''
    num_sents = 0
    num_chunked = 0
    try:
        for tokenization in get_tokenizations(comm):
            num_sents += 1
            try:
                # A missing or empty parseList means there is no tree
                # to convert for this tokenization.
                if not tokenization.parseList:
                    continue
                parse = tokenization.parseList[0]

                # Convert concrete Parse to a PTB style parse string to
                # use as stdin for chunklink.
                ptb_str = '( ' + penn_treebank_for_parse(parse) + ' )\n'
                ptb_str = ptb_str.encode('ascii', 'replace')
                logging.debug("PTB string: %s", ptb_str)

                # Run the chunklink script and capture the output.
                try:
                    # We expect the chunklink script to be a modified
                    # version which can read a tree from stdin.
                    command = ['perl', chunklink]
                    p = subprocess.Popen(command,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE,
                                         stdin=subprocess.PIPE)
                    chunk_str, chunk_err = p.communicate(input=ptb_str)
                    logging.debug("Chunklink stdout:\n%s", chunk_str)
                    logging.debug("Chunklink stderr:\n%s", chunk_err)

                    # communicate() does not raise on a failing child,
                    # so surface a non-zero exit status explicitly;
                    # otherwise the CalledProcessError handler below
                    # could never run.
                    if p.returncode != 0:
                        raise subprocess.CalledProcessError(
                            p.returncode, command, chunk_str)

                    # NOTE(review): chunk_str is handed to get_chunks()
                    # exactly as read from the pipe -- confirm that
                    # get_chunks() accepts that type on the target
                    # Python version.
                    chunk_tags = get_chunks(chunk_str)
                    logging.debug("Chunk tags: %s", chunk_tags)

                    num_tokens = len(tokenization.tokenList.tokenList)
                    if len(chunk_tags) != num_tokens:
                        raise Exception(
                            "ERROR: incorrect number of chunks. "
                            "expected=%d actual=%d"
                            % (num_tokens, len(chunk_tags)))

                    metadata = concrete.AnnotationMetadata()
                    metadata.tool = "Chunklink Constituency Converter"
                    # int() rather than the Python-2-only long().
                    metadata.timestamp = int(time.time())

                    # Extract the chunks column and create a
                    # TokenTagging from it.
                    chunks = concrete.TokenTagging()
                    chunks.uuid = concrete_uuid.generate_UUID()
                    chunks.metadata = metadata
                    chunks.taggingType = "CHUNK"
                    chunks.taggedTokenList = []
                    for i, chunk in enumerate(chunk_tags):
                        tt = concrete.TaggedToken()
                        tt.tokenIndex = i
                        tt.tag = chunk
                        chunks.taggedTokenList.append(tt)

                    # Add chunks to the list of TokenTaggings.
                    if not tokenization.tokenTaggingList:
                        tokenization.tokenTaggingList = []
                    tokenization.tokenTaggingList.append(chunks)
                    num_chunked += 1
                except subprocess.CalledProcessError:
                    logging.error("Chunklink failed on tree: %s", ptb_str)
                    if fail_on_error:
                        # Bare raise keeps the original traceback.
                        raise
            except Exception:
                logging.exception("Chunking failed on tokenization")
                if fail_on_error:
                    raise
    except Exception:
        logging.exception("Chunking failed on Communication")
        if fail_on_error:
            raise
    return num_chunked, num_sents
def create_dummy_annotation():
    '''Build placeholder AnnotationMetadata to satisfy the format.

    Returns a concrete.AnnotationMetadata whose tool is
    'Quora Scrape Ingest' and whose timestamp is the current time
    truncated to whole seconds.
    '''
    return concrete.AnnotationMetadata(
        tool='Quora Scrape Ingest',
        timestamp=int(time.time()),
    )
def create_dummy_annotation():
    '''Return AnnotationMetadata naming the ingest tool and the current time.

    The tool is set to 'Quora Scrape Ingest' and the timestamp to the
    current time truncated to whole seconds.
    '''
    metadata = concrete.AnnotationMetadata()
    # The two fields are independent, so assignment order does not matter.
    metadata.timestamp = int(time.time())
    metadata.tool = 'Quora Scrape Ingest'
    return metadata