def test_validate_minimal_communication_with_uuid():
    """A Communication carrying id, metadata, type, and uuid must validate."""
    communication = Communication()
    communication.id = "myID"
    communication.metadata = AnnotationMetadata(
        tool="TEST",
        timestamp=int(time.time()))
    communication.type = "Test Communication"
    communication.uuid = generate_UUID()
    assert validate_communication(communication)
def _next_from_stream(self):
    """Read the next Communication from the underlying Thrift stream.

    Returns a (Communication, source filename) pair.  When the stream
    is exhausted, closes the transport and stops iteration.
    """
    try:
        communication = Communication()
        communication.read(self.protocol)
        if self._add_references:
            add_references_to_communication(communication)
        return (communication, self._source_filename)
    except EOFError:
        # End of stream: release the transport before signalling the caller.
        self.transport.close()
        raise StopIteration
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:
    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
      The string will be whitespace-tokenized.

    Returns:
    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    toolname = "TEST"
    timestamp = int(time.time())
    aug = AnalyticUUIDGeneratorFactory().create()

    comm = Communication(
        id=comm_id,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        type=toolname,
        uuid=aug.next())

    # Whitespace-tokenize the sentence text into indexed Token objects.
    tokens = [Token(text=word, tokenIndex=index)
              for index, word in enumerate(sentence_string.split())]
    tokenization = Tokenization(
        kind=TokenizationKind.TOKEN_LIST,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        tokenList=TokenList(tokenList=tokens),
        uuid=aug.next())

    sentence = Sentence(
        textSpan=TextSpan(0, len(sentence_string)),
        tokenization=tokenization,
        uuid=aug.next())
    section = Section(
        kind="SectionKind",
        sentenceList=[sentence],
        textSpan=TextSpan(0, len(sentence_string)),
        uuid=aug.next())

    comm.sectionList = [section]
    comm.text = sentence_string
    return comm
def read_communication_from_buffer(buf, add_references=True):
    '''
    Deserialize buf (a binary string) and return resulting
    communication.  Add references if requested.
    '''
    transport = TMemoryBuffer(buf)
    protocol = factory.createProtocol(transport)
    communication = Communication()
    communication.read(protocol)
    if add_references:
        add_references_to_communication(communication)
    return communication
def main():
    """Convert a file of serialized Communications between Thrift encodings.

    The input/output protocol pair comes either from --direction (a
    known conversion) or from explicit --iprotocol/--oprotocol (the two
    forms are mutually exclusive).  Gzip compression on either side is
    inferred from the file extension via mimetypes.
    """
    set_stdout_encoding()

    parser = make_parser()
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    mimetypes.init()
    (ifile_type, ifile_encoding) = mimetypes.guess_type(args.input_file)
    (ofile_type, ofile_encoding) = mimetypes.guess_type(args.output_file)

    out_writer = None

    # --direction and --iprotocol/--oprotocol are mutually exclusive.
    if args.direction is None:
        if args.iprotocol is None or args.oprotocol is None:
            print("Either --direction, or both --iprotocol and --oprotocol,"
                  " must be provided")
            exit(1)
    else:
        if (args.iprotocol is not None) or (args.oprotocol is not None):
            print("Not both --direction, and either --iprotocol or"
                  " --oprotocol, can be provided")
            exit(1)

    encoding_input = KNOWN_CONVERSIONS[
        args.direction][0] if args.iprotocol is None else PROTOCOLS[
        args.iprotocol]
    encoding_output = KNOWN_CONVERSIONS[
        args.direction][1] if args.oprotocol is None else PROTOCOLS[
        args.oprotocol]

    if ofile_encoding == "gzip":
        out_writer = gzip.GzipFile(args.output_file, 'wb')
    else:
        # BUG FIX: TSerialization.serialize produces bytes, so the
        # output file must be opened in binary mode ('wb'), not 'w'.
        out_writer = open(args.output_file, 'wb')

    if ifile_encoding == 'gzip':
        f = gzip.GzipFile(args.input_file)
        transportIn = TTransport.TFileObjectTransport(f)
        protocolIn = encoding_input().getProtocol(transportIn)
        # Read Communications until the stream is exhausted.
        while True:
            try:
                comm = Communication()
                comm.read(protocolIn)
                output_bytes = TSerialization.serialize(
                    comm, protocol_factory=encoding_output())
                out_writer.write(output_bytes)
            except EOFError:
                break
        f.close()
    else:
        # NOTE(review): convert() opens the output path itself, so
        # out_writer is unused on this branch -- confirm intended.
        convert(input_file_path=args.input_file,
                output_file_path=args.output_file,
                input_protocol_factory=encoding_input,
                output_protocol_factory=encoding_output)
    out_writer.close()
def main():
    """Convert a file of serialized Communications between Thrift encodings.

    Protocols come either from --direction (a known conversion pair) or
    from explicit --iprotocol/--oprotocol; gzip compression on either
    side is inferred from the file extension via mimetypes.
    """
    set_stdout_encoding()

    parser = make_parser()
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    mimetypes.init()
    (ifile_type, ifile_encoding) = mimetypes.guess_type(args.input_file)
    (ofile_type, ofile_encoding) = mimetypes.guess_type(args.output_file)

    out_writer = None

    # --direction and --iprotocol/--oprotocol are mutually exclusive.
    if args.direction is None:
        if args.iprotocol is None or args.oprotocol is None:
            print("Either --direction, or both --iprotocol and --oprotocol,"
                  " must be provided")
            exit(1)
    else:
        if (args.iprotocol is not None) or (args.oprotocol is not None):
            print("Not both --direction, and either --iprotocol or"
                  " --oprotocol, can be provided")
            exit(1)

    encoding_input = KNOWN_CONVERSIONS[args.direction][
        0] if args.iprotocol is None else PROTOCOLS[args.iprotocol]
    encoding_output = KNOWN_CONVERSIONS[args.direction][
        1] if args.oprotocol is None else PROTOCOLS[args.oprotocol]

    if ofile_encoding == "gzip":
        out_writer = gzip.GzipFile(args.output_file, 'wb')
    else:
        # BUG FIX: TSerialization.serialize produces bytes, so the
        # output file must be opened in binary mode ('wb'), not 'w'.
        out_writer = open(args.output_file, 'wb')

    if ifile_encoding == 'gzip':
        f = gzip.GzipFile(args.input_file)
        transportIn = TTransport.TFileObjectTransport(f)
        protocolIn = encoding_input().getProtocol(transportIn)
        # Read Communications until the stream is exhausted.
        while True:
            try:
                comm = Communication()
                comm.read(protocolIn)
                output_bytes = TSerialization.serialize(
                    comm, protocol_factory=encoding_output())
                out_writer.write(output_bytes)
            except EOFError:
                break
        f.close()
    else:
        # NOTE(review): convert() opens the output path itself, so
        # out_writer is unused on this branch -- confirm intended.
        convert(input_file_path=args.input_file,
                output_file_path=args.output_file,
                input_protocol_factory=encoding_input,
                output_protocol_factory=encoding_output)
    out_writer.close()
def createComm(cid, ctype, txt):
    '''Creates concrete communication file from text'''
    comm = Communication()
    comm.id = cid
    comm.uuid = concrete.util.generate_UUID()
    comm.type = ctype

    # Normalize whitespace: map stray 0xa0/0xc2 chars to spaces and
    # collapse whitespace around newlines.
    cleaned = re.sub('[\xa0\xc2]', ' ', txt)
    cleaned = re.sub(r'\s*\n\s*', '\n', cleaned)
    if not cleaned.strip():
        return None
    comm.text = cleaned
    comm.metadata = create_dummy_annotation()

    # Indices of newlines that terminate a non-empty line; each one
    # ends a Passage section.
    breaks = [idx for idx, ch in enumerate(cleaned)
              if ch == '\n' and idx > 0 and cleaned[idx - 1] != '\n']
    if not breaks or breaks[-1] != len(cleaned) - 1:
        breaks.append(len(cleaned))

    sections = []
    start = 0
    for end in breaks:
        sec = concrete.Section()
        sec.uuid = concrete.util.generate_UUID()
        sec.kind = "Passage"
        sec.textSpan = concrete.TextSpan(start, end)
        sections.append(sec)
        start = end
    comm.sectionList = sections

    if not concrete.validate.validate_communication(comm):
        return None
    return comm
def commFromData(data):
    '''Returns Communication object generated from byte string'''
    communication = Communication()
    TSerialization.deserialize(
        communication,
        data,
        protocol_factory=TCompactProtocol.TCompactProtocolFactory())
    return communication
def _next_from_zip(self):
    """Return the next (Communication, entry filename) pair from the zip.

    Raises StopIteration once every archive entry has been consumed.
    """
    if self.zip_infolist_index >= len(self.zip_infolist):
        raise StopIteration
    entry = self.zip_infolist[self.zip_infolist_index]
    self.zip_infolist_index += 1
    comm = TSerialization.deserialize(
        Communication(),
        self.zip.open(entry).read(),
        protocol_factory=factory.protocolFactory)
    if self._add_references:
        add_references_to_communication(comm)
    return (comm, entry.filename)
def test_validate_minimal_communication_with_uuid():
    """Minimal Communication (id, metadata, type, uuid) should validate."""
    comm = Communication(
        id="myID",
        metadata=AnnotationMetadata(tool="TEST", timestamp=int(time.time())),
        type="Test Communication",
        uuid=generate_UUID(),
    )
    assert validate_communication(comm)
def json_to_concrete(doc: Dict) -> Communication:
    """Build a Concrete Communication from a JSON-derived document dict.

    Reads ``doc['doc_key']`` (used as the Communication id) and
    ``doc['language_id']`` (recorded as the single language with
    probability 1.0); tokens come from ``get_flatten_sentence(doc)``.

    NOTE(review): this function relies on a module-level UUID generator
    ``augf`` and on ``get_flatten_sentence`` -- neither is defined here;
    confirm both are initialized before this is called.
    """
    metadata = AnnotationMetadata(
        tool="BlingBLing",
        timestamp=int(datetime.datetime.now().timestamp())
    )
    comm: Communication = Communication(
        uuid=augf.next(),
        id=doc['doc_key'],
        type="aida",
        metadata=metadata,
        # Single-language identification, probability pinned to 1.0.
        lidList=[LanguageIdentification(
            uuid=augf.next(),
            metadata=metadata,
            languageToProbabilityMap={doc['language_id']: 1.0}
        )],
        # One passage section holding one sentence with all tokens.
        sectionList=[Section(
            uuid=augf.next(),
            kind="passage",
            sentenceList=[
                Sentence(
                    uuid=augf.next(),
                    tokenization=Tokenization(
                        uuid=augf.next(),
                        kind=TokenizationKind.TOKEN_LIST,
                        metadata=metadata,
                        tokenList=TokenList(
                            tokenList=[
                                Token(
                                    tokenIndex=i,
                                    text=t
                                )
                                for i, t in enumerate(
                                    get_flatten_sentence(doc))
                            ]
                        )
                    )
                )
            ]
        )],
        # Empty mention sets, ready for downstream annotators to fill.
        entityMentionSetList=[EntityMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )],
        situationMentionSetList=[SituationMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )]
    )
    return comm
def _comm_with_properties(num_properties):
    """Build a test Communication whose SituationMention argument carries
    `num_properties` Property objects, with cross-references resolved."""
    timestamp = 17

    tokenization = Tokenization(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='tokn-tool', timestamp=timestamp),
        kind=TokenizationKind.TOKEN_LIST,
        tokenList=TokenList(tokenList=[
            Token(tokenIndex=0, text='text',
                  textSpan=TextSpan(start=0, ending=1))
        ]))
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokenization)
    section = Section(uuid=generate_UUID(), kind='kind', label='label',
                      sentenceList=[sentence])

    token_refs = TokenRefSequence(tokenizationId=tokenization.uuid,
                                  tokenIndexList=[0],
                                  anchorTokenIndex=0)
    mention = EntityMention(uuid=generate_UUID(), entityType='entityType',
                            text='text', tokens=token_refs)
    entity_mentions = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='ems-tool', timestamp=timestamp),
        mentionList=[mention])

    prop_meta = AnnotationMetadata(tool='Annotator1', timestamp=timestamp)
    properties = [Property(value="Property%d" % i,
                           metadata=prop_meta,
                           polarity=4.0)
                  for i in range(num_properties)]
    argument = MentionArgument(role='role',
                               entityMentionId=mention.uuid,
                               propertyList=properties)
    situation_mention = SituationMention(uuid=generate_UUID(),
                                         tokens=token_refs,
                                         argumentList=[argument])
    situation_mentions = SituationMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='sms-tool', timestamp=timestamp),
        mentionList=[situation_mention])

    comm = Communication(
        uuid=generate_UUID(),
        id='id',
        text='text',
        type='type',
        metadata=AnnotationMetadata(tool='tool', timestamp=timestamp),
        sectionList=[section],
        situationMentionSetList=[situation_mentions],
        entityMentionSetList=[entity_mentions])
    add_references_to_communication(comm)
    return comm
def json_tweet_object_to_Communication(tweet):
    """Convert a decoded Twitter JSON object into a Concrete Communication."""
    tweet_info = json_tweet_object_to_TweetInfo(tweet)

    aug = AnalyticUUIDGeneratorFactory().create()

    try:
        tweet_id = tweet['id_str']
    except KeyError:
        logging.warning('Tweet has no id_str, leaving communication id blank')
        tweet_id = None

    tweet_time = datetime_to_timestamp(
        datetime.strptime(tweet_info.createdAt, CREATED_AT_FORMAT))

    comm = Communication(
        communicationMetadata=CommunicationMetadata(tweetInfo=tweet_info),
        metadata=AnnotationMetadata(tool=TOOL_NAME,
                                    timestamp=int(time.time())),
        originalText=tweet_info.text,
        text=tweet_info.text,
        type=TWEET_TYPE,
        uuid=aug.next(),
        startTime=tweet_time,
        endTime=tweet_time,
        id=tweet_id
    )

    # either this, or pass in gen as parameter to fx
    # latter is more annoying to test but slightly cleaner
    if tweet_info.lid is not None:
        tweet_info.lid.uuid = aug.next()
        comm.lidList = [tweet_info.lid]
    return comm
def json_tweet_object_to_Communication(tweet):
    """Build a Concrete Communication from a decoded tweet JSON object."""
    tweet_info = json_tweet_object_to_TweetInfo(tweet)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    if 'id_str' not in tweet:
        logging.warning('Tweet has no id_str, leaving communication id blank')
        tweet_id = None
    else:
        tweet_id = tweet['id_str']

    created = datetime.strptime(tweet_info.createdAt, CREATED_AT_FORMAT)
    tweet_time = unix_time(created)

    comm = Communication(
        communicationMetadata=CommunicationMetadata(tweetInfo=tweet_info),
        metadata=AnnotationMetadata(tool=TOOL_NAME,
                                    timestamp=int(time.time())),
        originalText=tweet_info.text,
        text=tweet_info.text,
        type=TWEET_TYPE,
        uuid=aug.next(),
        startTime=tweet_time,
        endTime=tweet_time,
        id=tweet_id
    )

    # either this, or pass in gen as parameter to fx
    # latter is more annoying to test but slightly cleaner
    if tweet_info.lid is not None:
        tweet_info.lid.uuid = aug.next()
        comm.lidList = [tweet_info.lid]
    return comm
def read_communication_from_file(communication_filename, add_references=True):
    """Read a Communication from the file specified by filename

    Args:
    - `communication_filename`: String with filename

    Returns:
    - A Concrete `Communication` object
    """
    communication = read_thrift_from_file(Communication(),
                                          communication_filename)
    if add_references:
        add_references_to_communication(communication)
    return communication
def create_comm(comm_id, text='', comm_type='article', section_kind='passage',
                metadata_tool='concrete-python', metadata_timestamp=None,
                annotation_level=AL_TOKEN):
    '''
    Create a simple, valid Communication from text.

    By default the text will be split by double-newlines into sections
    and then by single newlines into sentences within those sections.

    annotation_level controls the amount of annotation that is added:
      AL_NONE      add no optional annotations (not even sections)
      AL_SECTION   add sections but not sentences
      AL_SENTENCE  add sentences but not tokens
      AL_TOKEN     add all annotations, up to tokens (the default)

    If metadata_timestamp is None, the current time will be used.
    '''
    if metadata_timestamp is None:
        metadata_timestamp = int(time.time())

    aug = AnalyticUUIDGeneratorFactory().create()

    # Draw the Communication's UUID before any section UUIDs so the
    # generator is consumed in the original order.
    comm_uuid = aug.next()

    with_sections = (annotation_level is not None) and \
        (annotation_level != AL_NONE)
    if with_sections:
        if text.strip():
            section_list = [
                create_section(sec_text, sec_start, sec_end, section_kind,
                               aug, metadata_tool, metadata_timestamp,
                               annotation_level)
                for (sec_text, sec_start, sec_end) in _split(text, '\n\n')
            ]
        else:
            section_list = []
    else:
        section_list = None

    return Communication(
        id=comm_id,
        uuid=comm_uuid,
        type=comm_type,
        text=text,
        metadata=AnnotationMetadata(
            tool=metadata_tool,
            timestamp=metadata_timestamp,
        ),
        sectionList=section_list,
    )
def main():
    """Extract serialized Communications from a CSV column into a zip.

    Reads each row of `csv_file`, deserializes the Thrift-JSON encoded
    Communication found in `--comm-field`, and writes it into
    `comms_zip_file` as `<comm.id>.comm`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('csv_file')
    parser.add_argument('comms_zip_file')
    parser.add_argument('--comm-field', default='Answer.modified_comm')
    args = parser.parse_args()

    # BUG FIX: the CSV file handle was opened but never closed; use a
    # context manager so it is released even if an exception is raised.
    with open(args.csv_file, 'rb') as csv_fh:
        reader = unicodecsv.DictReader(csv_fh)
        with CommunicationWriterZip(args.comms_zip_file) as writer:
            for row in reader:
                json_comm = row[args.comm_field]
                comm = Communication()
                TSerialization.deserialize(
                    comm, json_comm.encode('utf-8'),
                    protocol_factory=TJSONProtocol.TJSONProtocolFactory())
                writer.write(comm, comm.id + '.comm')
def convert_communication(input_bytes, input_protocol_factory,
                          output_protocol_factory):
    """Re-encode one serialized Communication between Thrift protocols.

    * input_bytes: Input file byte stream
    * input_protocol_factory: Callable factory function for input
      encoding, e.g., TBinaryProtocol.TBinaryProtocolFactory.
    * output_protocol_factory: Callable factory function for output
      encoding, e.g., TCompactProtocol.TCompactProtocolFactory.

    Returns the Communication serialized with the output protocol.
    """
    communication = Communication()
    TSerialization.deserialize(communication, input_bytes,
                               protocol_factory=input_protocol_factory())
    return TSerialization.serialize(
        communication, protocol_factory=output_protocol_factory())
def createComm(fn):
    """Read a Quora text file and wrap it as a validated Communication.

    Returns None when the cleaned text is empty or validation fails.
    """
    with codecs.open(fn, 'r', 'utf-8') as f:
        raw = f.read()

    comm = Communication()
    comm.id = fn
    comm.uuid = concrete.util.generate_UUID()
    # Files named answer* are answers; everything else is a question.
    if fn.split('/')[-1].startswith("answer"):
        comm.type = "QUORA ANSWER"
    else:
        comm.type = "QUORA QUESTION"

    # Normalize stray 0xa0/0xc2 characters and whitespace around newlines.
    cleaned = re.sub('[\xa0\xc2]', ' ', raw)
    cleaned = re.sub(r'\s*\n\s*', '\n', cleaned)
    if not cleaned.strip():
        return None
    comm.text = cleaned
    comm.metadata = create_dummy_annotation()

    # Each newline terminating a non-empty line ends a Passage section.
    breaks = [idx for idx, ch in enumerate(cleaned)
              if ch == '\n' and idx > 0 and cleaned[idx - 1] != '\n']
    if not breaks or breaks[-1] != len(cleaned) - 1:
        breaks.append(len(cleaned))

    sections = []
    start = 0
    for end in breaks:
        sec = concrete.Section()
        sec.uuid = concrete.util.generate_UUID()
        sec.kind = "Passage"
        sec.textSpan = concrete.TextSpan(start, end)
        sections.append(sec)
        start = end
    comm.sectionList = sections

    if not concrete.validate.validate_communication(comm):
        return None
    return comm
def _next_from_tar(self):
    """Return the next (Communication, entry name) pair from the tar.

    Skips directories and macOS AppleDouble ("._*") attribute entries;
    raises StopIteration when the archive is exhausted.
    """
    while True:
        tarinfo = self.tar.next()
        if tarinfo is None:
            raise StopIteration
        if not tarinfo.isfile():
            # Ignore directories
            continue
        filename = os.path.split(tarinfo.name)[-1]
        # BUG FIX: the original compared characters with `is`, which
        # tests object identity (an interning accident, not equality)
        # and raised IndexError on one-character filenames; use
        # startswith for a correct, safe prefix test.
        if filename.startswith('._'):
            # Ignore attribute files created by OS X tar
            continue
        comm = TSerialization.deserialize(
            Communication(),
            self.tar.extractfile(tarinfo).read(),
            protocol_factory=factory.protocolFactory)
        if self._add_references:
            add_references_to_communication(comm)
        # hack to keep memory usage O(1)
        # (...but the real hack is tarfile :)
        self.tar.members = []
        return (comm, tarinfo.name)
def index():
    """Handle a form post: annotate the submitted text and render results."""
    text = request.forms.get('text')

    # Connect to the remote annotator service over a framed transport.
    socket = TSocket.TSocket(options.annotator_host, options.annotator_port)
    transport = TTransport.TFramedTransport(socket)
    protocol = TCompactProtocol.TCompactProtocol(transport)
    client = Annotator.Client(protocol)
    transport.open()

    aug = AnalyticUUIDGeneratorFactory().create()
    comm = Communication(
        id="",
        text=text,
        uuid=aug.next(),
        type="user-supplied input",
        metadata=AnnotationMetadata(timestamp=int(time.time()), tool="stdin"),
        sectionList=[
            Section(uuid=aug.next(),
                    sentenceList=[],
                    kind="paragraph",
                    textSpan=TextSpan(start=0, ending=len(text)))
        ],
        entitySetList=[],
        entityMentionSetList=[],
    )
    annotated = client.annotate(comm)

    form = '''<form action="/" method="post">
Enter or paste some text: <input name="text" type="text" />
<input value="Submit" type="submit" />
</form>
'''
    entity_blocks = [
        "\n".join(["<br>%s %s" % (e.type, e.canonicalName)
                   for e in es.entityList])
        for es in annotated.entitySetList
    ]
    return form + "\n".join(["<h3>%s</h3>" % text] + entity_blocks)
" --oprotocol, can be provided") exit(1) encoding_input = KNOWN_CONVERSIONS[args.direction][ 0] if args.iprotocol is None else PROTOCOLS[args.iprotocol] encoding_output = KNOWN_CONVERSIONS[args.direction][ 1] if args.oprotocol is None else PROTOCOLS[args.oprotocol] if ofile_encoding == "gzip": out_writer = gzip.GzipFile(args.output_file, 'wb') else: out_writer = open(args.output_file, 'w') if ifile_encoding == 'gzip': f = gzip.GzipFile(args.input_file) transportIn = TTransport.TFileObjectTransport(f) protocolIn = encoding_input().getProtocol(transportIn) while True: try: comm = Communication() comm.read(protocolIn) output_bytes = TSerialization.serialize( comm, protocol_factory=encoding_output()) out_writer.write(output_bytes) except EOFError: break f.close() else: convert(input_file_path=args.input_file, output_file_path=args.output_file, input_protocol_factory=encoding_input, output_protocol_factory=encoding_output) out_writer.close()
encoding_input = KNOWN_CONVERSIONS[ args.direction][0] if args.iprotocol is None else PROTOCOLS[ args.iprotocol] encoding_output = KNOWN_CONVERSIONS[ args.direction][1] if args.oprotocol is None else PROTOCOLS[ args.oprotocol] if ofile_encoding == "gzip": out_writer = gzip.GzipFile(args.output_file, 'wb') else: out_writer = open(args.output_file, 'w') if ifile_encoding == 'gzip': f = gzip.GzipFile(args.input_file) transportIn = TTransport.TFileObjectTransport(f) protocolIn = encoding_input().getProtocol(transportIn) while True: try: comm = Communication() comm.read(protocolIn) output_bytes = TSerialization.serialize( comm, protocol_factory=encoding_output()) out_writer.write(output_bytes) except EOFError: break f.close() else: convert(input_file_path=args.input_file, output_file_path=args.output_file, input_protocol_factory=encoding_input, output_protocol_factory=encoding_output) out_writer.close()
def test_next(self):
    """Build a Communication and assign it a freshly generated UUID."""
    communication = Communication()
    communication.uuid = generate_UUID()
def test_generate_UUID():
    """generate_UUID() output can be assigned as a Communication uuid."""
    communication = Communication()
    communication.uuid = generate_UUID()
# Interactive client: read lines from stdin, wrap each as a minimal
# Communication, and send it to a remote Annotator service.
# NOTE(review): Python 2 code (raw_input and the print statement) --
# confirm the target interpreter before modernizing.
parser.add_argument("-p", "--port", dest="port", type=int, default=9090)
parser.add_argument("-H", "--host", dest="host", default="localhost")
options = parser.parse_args()

# Make socket
transport = TSocket.TSocket(options.host, options.port)

# Buffering is critical. Raw sockets are very slow
transport = TTransport.TBufferedTransport(transport)

# Wrap in a protocol
protocol = TCompactProtocol.TCompactProtocol(transport)

# Create a client to use the protocol encoder
client = Annotator.Client(protocol)

# Connect!
transport.open()

while True:
    s = raw_input("Write some text > ")
    # A blank (whitespace-only) line ends the session.
    if re.match(r"^\s*$", s):
        break
    else:
        augf = AnalyticUUIDGeneratorFactory()
        aug = augf.create()
        c = Communication(id="",
                          text=s,
                          uuid=aug.next(),
                          type="tweet",
                          metadata=AnnotationMetadata(timestamp=0,
                                                      tool="stdin"),
                          lidList=[])
        new_c = client.annotate(c)
        print new_c
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:
    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
      The string will be whitespace-tokenized.

    Returns:
    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    toolname = "TEST"
    timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = Communication(
        id=comm_id,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        type=toolname,
        uuid=aug.next()
    )

    tokenization = Tokenization(
        kind=TokenizationKind.TOKEN_LIST,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        tokenList=TokenList(tokenList=[]),
        uuid=aug.next()
    )
    # Whitespace-tokenize the sentence text into indexed tokens.
    for index, word in enumerate(sentence_string.split()):
        tokenization.tokenList.tokenList.append(
            Token(text=word, tokenIndex=index))

    sentence = Sentence(
        textSpan=TextSpan(0, len(sentence_string)),
        tokenization=tokenization,
        uuid=aug.next()
    )
    section = Section(
        kind="SectionKind",
        sentenceList=[sentence],
        textSpan=TextSpan(0, len(sentence_string)),
        uuid=aug.next()
    )

    comm.sectionList = [section]
    comm.text = sentence_string
    return comm
# Interactive loop: read lines from stdin, wrap each as a one-section
# Communication, send it to the annotator service, and print the
# entities it found.
# NOTE(review): Python 2 code (raw_input and the print statement) --
# confirm the target interpreter before modernizing.
transport.open()

while True:
    s = raw_input("Write some text > ")
    # A blank (whitespace-only) line ends the session.
    if re.match(r"^\s*$", s):
        break
    else:
        augf = AnalyticUUIDGeneratorFactory()
        aug = augf.create()
        c = Communication(
            id="",
            text=s,
            uuid=aug.next(),
            type="user-supplied input",
            metadata=AnnotationMetadata(timestamp=int(time.time()),
                                        tool="stdin"),
            sectionList=[
                Section(uuid=aug.next(),
                        sentenceList=[],
                        kind="paragraph",
                        textSpan=TextSpan(start=0, ending=len(s)))
            ],
            entitySetList=[],
            entityMentionSetList=[],
        )
        new_c = client.annotate(c)
        # Print every entity's type and canonical name.
        for es in new_c.entitySetList:
            for e in es.entityList:
                print "%s %s" % (e.type, e.canonicalName)
# Ingest a gzipped TSV file (id <TAB> label <TAB> text per line) into a
# .tgz archive of Communications, attaching the label as a
# CommunicationTagging.
# NOTE(review): depends on `options`, `ugf`, `reader`, and `time` being
# defined earlier in this script -- not visible here.
ofd = CommunicationWriterTGZ(options.output)
with reader(gzip.open(options.input)) as ifd:
    for i, line in enumerate(ifd):
        toks = line.strip().split("\t")
        # Skip malformed rows that do not have exactly three fields.
        if len(toks) != 3:
            continue
        cid, label, text = toks
        g = ugf.create()
        t = int(time())
        comm = Communication(id=cid,
                             uuid=g.next(),
                             type="Text document",
                             text=text,
                             # Gold label carried as a 1-best tagging.
                             communicationTaggingList=[CommunicationTagging(
                                 uuid=g.next(),
                                 metadata=AnnotationMetadata(
                                     tool="Gold labeling",
                                     timestamp=t,
                                     kBest=1,
                                 ),
                                 taggingType=options.tag_type,
                                 tagList=[label],
                                 confidenceList=[1.0],
                             )],
                             metadata=AnnotationMetadata(
                                 tool="text_to_concrete.py ingester",
                                 timestamp=t,
                                 kBest=1),
                             # Single section spanning the whole text.
                             sectionList=[Section(uuid=g.next(),
                                                  textSpan=TextSpan(
                                                      start=0,
                                                      ending=len(text)),
                                                  kind="content",
                                                  )
                                          ])
        ofd.write(comm)
ofd.close()