def compress_uuids(input_path, output_path, verify=False, uuid_map_path=None,
                   single_analytic=False):
    """Rewrite the UUIDs of all Communications in a tar archive.

    Reads Communications from ``input_path``, compresses each one's UUIDs
    via ``_compress_uuids``, and writes the results to a TGZ archive at
    ``output_path``.

    Args:
        input_path: path of input Communications archive
        output_path: path of output TGZ archive
        verify: if True, pass ``verify`` through to ``_compress_uuids``
            (presumably enables consistency checking — confirm against
            ``_compress_uuids``)
        uuid_map_path: optional path of a text file to which
            ``old-uuid new-uuid`` pairs are written, one per line
        single_analytic: passed through to ``_compress_uuids``
    """
    reader = CommunicationReader(input_path, add_references=False)
    writer = CommunicationWriterTGZ(output_path)
    if uuid_map_path is None:
        uuid_map_file = None
    else:
        uuid_map_file = open(uuid_map_path, 'w')
    # Close writer and map file even if compression of some document fails;
    # the original leaked both handles (the TGZ could be left truncated).
    try:
        for (i, (comm, comm_filename)) in enumerate(reader):
            (new_comm, uc) = _compress_uuids(comm, verify=verify,
                                             single_analytic=single_analytic)
            logging.info('compressed %s (%d analytics, %d uuids) (%d/?)' % (
                comm.id, len(uc.augs), len(uc.uuid_map), i + 1))
            if uuid_map_file is not None:
                # Sort by the new UUID so the map file is deterministic.
                for (old_uuid, new_uuid) in sorted(uc.uuid_map.items(),
                                                   key=lambda p: str(p[1])):
                    uuid_map_file.write('%s %s\n' % (old_uuid, new_uuid))
            writer.write(new_comm, comm_filename=comm_filename)
    finally:
        if uuid_map_file is not None:
            uuid_map_file.close()
        writer.close()
def test_CommunicationWriterTGZ_single_file_default_name(output_file):
    """Write one Communication to a TGZ and check the tar member metadata.

    Verifies the default member name (``<uuid>.concrete``), file type,
    mtime, size, mode, and ownership of the single archive entry.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterTGZ()
    # Close in finally so a failed write does not leak the archive handle.
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert tarfile.is_tarfile(output_file)

    f = tarfile.open(output_file)

    tarinfo = f.next()
    assert tarinfo is not None

    # Default member name is the Communication's UUID plus '.concrete'.
    assert comm.uuid.uuidString + '.concrete' == tarinfo.name
    assert tarinfo.isreg()
    assert tarinfo.mtime > time.time() - TIME_MARGIN
    assert os.stat('tests/testdata/simple_1.concrete').st_size == tarinfo.size
    # 0o644: Python 3 octal syntax (the original '0644' is a SyntaxError
    # in Python 3).
    assert 0o644 == tarinfo.mode
    assert os.getuid() == tarinfo.uid
    assert pwd.getpwuid(os.getuid()).pw_name == tarinfo.uname
    assert os.getgid() == tarinfo.gid
    assert grp.getgrgid(os.getgid()).gr_name == tarinfo.gname

    tarinfo = f.next()
    assert tarinfo is None

    f.close()
def test_CommunicationWriterTGZ_single_file_default_name(output_file, login_info):
    """Write one Communication to a TGZ and check the tar member metadata.

    Variant that takes ownership expectations from the ``login_info``
    fixture (keys ``uid``, ``username``, ``gid``, ``groupname``) instead of
    querying ``pwd``/``grp`` directly.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterTGZ()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert tarfile.is_tarfile(output_file)

    f = tarfile.open(output_file)

    tarinfo = f.next()
    assert tarinfo is not None

    # Default member name is the Communication's UUID plus '.concrete'.
    assert comm.uuid.uuidString + ".concrete" == tarinfo.name
    assert tarinfo.isreg()
    assert tarinfo.mtime > time.time() - TIME_MARGIN
    assert os.stat("tests/testdata/simple_1.concrete").st_size == tarinfo.size
    # 0o644: Python 3 octal syntax (the original '0644' is a SyntaxError
    # in Python 3).
    assert 0o644 == tarinfo.mode
    assert login_info["uid"] == tarinfo.uid
    assert login_info["username"] == tarinfo.uname
    assert login_info["gid"] == tarinfo.gid
    assert login_info["groupname"] == tarinfo.gname

    tarinfo = f.next()
    assert tarinfo is None

    f.close()
def main():
    """Convert a tarball of text files into a tarball of Communications.

    Command-line entry point: parses arguments, then streams documents
    from the input tarball (or stdin) through ``load`` and writes each
    resulting Communication to the output TGZ (or stdout).
    """
    set_stdout_encoding()

    arg_parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description='Convert tarball of text files to'
                    ' tarball of concrete communications',
    )
    arg_parser.set_defaults(annotation_level=AL_NONE,
                            log_level='INFO', log_interval=1000)
    arg_parser.add_argument('text_tarball_path', type=str,
                            help='Input text tar file path (- for stdin)')
    arg_parser.add_argument('concrete_tarball_path', type=str,
                            help='Output concrete tar file path (- for stdout)')
    arg_parser.add_argument('--per-line', action='store_true',
                            help='Text files have one document per line (default:'
                                 ' each text file is a document)')
    arg_parser.add_argument('--log-interval', type=int,
                            help='Log an info message every log-interval docs')
    add_annotation_level_argparse_argument(arg_parser)
    arg_parser.add_argument('-l', '--loglevel', '--log-level',
                            help='Logging verbosity level threshold (to stderr)',
                            default='info')
    concrete.version.add_argparse_argument(arg_parser)
    ns = arg_parser.parse_args()

    # '-' means stdin/stdout via the /dev/fd device files.
    # Won't work on Windows.
    in_path = ns.text_tarball_path
    if in_path == '-':
        in_path = '/dev/fd/0'
    out_path = ns.concrete_tarball_path
    if out_path == '-':
        out_path = '/dev/fd/1'

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=ns.loglevel.upper())

    with CommunicationWriterTGZ(out_path) as writer:
        doc_stream = load(in_path, ns.per_line, ns.annotation_level)
        for (idx, comm) in enumerate(doc_stream):
            if (idx + 1) % ns.log_interval == 0:
                logging.info(u'writing doc %d (%s)...' % (idx + 1, comm.id))
            writer.write(comm, comm.id)
writer = codecs.getwriter("utf-8") reader = codecs.getreader("utf-8") if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", dest="input", help="Input file where each line is \"ID<tab>TAG<tab>TEXT\"") parser.add_argument("-o", "--output", dest="output", help="Tar file to write Communications to") parser.add_argument("-t", "--tag_type", dest="tag_type", default=None, help="Type of tag (e.g. \"language\"): defaults to None, in which case the tag column is ignored (but must still be present!)") options = parser.parse_args() ugf = AnalyticUUIDGeneratorFactory() ofd = CommunicationWriterTGZ(options.output) with reader(gzip.open(options.input)) as ifd: for i, line in enumerate(ifd): toks = line.strip().split("\t") if len(toks) != 3: continue cid, label, text = toks g = ugf.create() t = int(time()) comm = Communication(id=cid, uuid=g.next(), type="Text document", text=text, communicationTaggingList=[CommunicationTagging(uuid=g.next(), metadata=AnnotationMetadata(tool="Gold labeling", timestamp=t,
def main():
    """Command-line client for a Concrete FetchCommunicationService.

    Connects over THttp/TJSONProtocol (``--uri``) or
    TSocket/TCompactProtocol (``--host``/``--port``), then performs the
    requested operations: fetch Communications by ID, query ``about``/
    ``alive``/``getCommunicationCount``/``getCommunicationIDs``, and
    optionally save fetched Communications to a TGZ archive.
    """
    set_stdout_encoding()

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Interface with a Concrete FetchCommunicationService server. "
                    "Supports either THttp/TJSONProtocol (using the '--uri' flag) "
                    "or TSocket/TCompactProtocol (using '--host'/'--port')"
    )
    parser.add_argument("--host", default="localhost",
                        help="Hostname of TSocket/TCompactProtocol FetchCommunicationService")
    parser.add_argument("-p", "--port", type=int, default=9090,
                        help="Port of TSocket/TCompactProtocol FetchCommunicationService")
    parser.add_argument('--uri', '--url',
                        help="URI of THttpServer/TJSONProtocol FetchCommunicationService")
    parser.add_argument("--about", action="store_true",
                        help="Print value of fetch_service.about()")
    parser.add_argument("--alive", action="store_true",
                        help="Print value of fetch_service.alive()")
    parser.add_argument("--count", action="store_true",
                        help="Print value of fetch_service.getCommunicationCount()")
    parser.add_argument("--get-ids", action="store_true",
                        help="Print list of Communication IDs returned by "
                             "fetch_service.getCommunicationIDs(offset, count). "
                             "The offset and count parameters are set using the "
                             "'--get-ids-offset' and '--get-ids-count' flags")
    # NOTE: the original help strings for the next two flags were swapped
    # (offset described as a count and vice versa); corrected here.
    parser.add_argument("--get-ids-offset", type=int, default=0, metavar="ID_OFFSET",
                        help="Offset for Communication IDs printed using the '--get-ids' flag")
    parser.add_argument("--get-ids-count", type=int, default=20, metavar="ID_COUNT",
                        help="Number of Communication IDs printed using the '--get-ids' flag")
    parser.add_argument("--save-as-tgz", metavar="TGZ_FILENAME",
                        help="Save fetched Communications to a TGZ archive containing files "
                             "named '[COMMUNICATION_ID].concrete'")
    parser.add_argument('-l', '--loglevel', '--log-level',
                        help='Logging verbosity level threshold (to stderr)',
                        default='info')
    parser.add_argument("comm_id", nargs="*",
                        help="IDs of Communications to be fetched. "
                             "If '-' is specified, a list of Communication"
                             " IDs will be read from "
                             "stdin, one Communication ID per line")
    parser.add_argument("--benchmark", action="store_true",
                        help="Enable Thrift RPC timing instrument.")
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    if args.benchmark:
        if is_accelerated():
            logging.info('Thrift acceleration is enabled.')
        else:
            logging.warning('Thrift acceleration is NOT enabled.')

    if args.uri:
        fetch_wrapper = HTTPFetchCommunicationClientWrapper(args.uri)
    else:
        fetch_wrapper = FetchCommunicationClientWrapper(args.host, args.port)

    with fetch_wrapper as client:
        if args.comm_id:
            fetch_request = FetchRequest()
            # A single '-' argument means: read IDs from stdin, one per line.
            if len(args.comm_id) == 1 and args.comm_id[0] == '-':
                fetch_request.communicationIds = [
                    line.strip() for line in sys.stdin.readlines()]
            else:
                fetch_request.communicationIds = args.comm_id
            if args.benchmark:
                start_time = time.time()
            fetch_result = client.fetch(fetch_request)
            if args.benchmark:
                logging.info('Time elapsed in fetch(): {:.4f}s'.format(
                    time.time() - start_time))
            print("Received FetchResult: '%s'" % fetch_result)

        if args.about:
            if args.benchmark:
                start_time = time.time()
            print("FetchCommunicationService.about() returned %s" % client.about())
            if args.benchmark:
                logging.info('Time elapsed in about(): {:.4f}s'.format(
                    time.time() - start_time))

        if args.alive:
            if args.benchmark:
                start_time = time.time()
            print("FetchCommunicationService.alive() returned %s" % client.alive())
            if args.benchmark:
                logging.info('Time elapsed in alive(): {:.4f}s'.format(
                    time.time() - start_time))

        if args.count:
            if args.benchmark:
                start_time = time.time()
            print("FetchCommunicationService.getCommunicationCount() returned %d"
                  % client.getCommunicationCount())
            if args.benchmark:
                logging.info('Time elapsed in getCommunicationCount(): {:.4f}s'.format(
                    time.time() - start_time))

        if args.get_ids:
            print("FetchCommunicationService.getCommunicationIDs(offset=%d, count=%d) returned:"
                  % (args.get_ids_offset, args.get_ids_count))
            if args.benchmark:
                start_time = time.time()
            ids = client.getCommunicationIDs(args.get_ids_offset, args.get_ids_count)
            if args.benchmark:
                logging.info('Time elapsed in getCommunicationIDs(): {:.4f}s'.format(
                    time.time() - start_time))
            for comm_id in ids:
                print(" %s" % comm_id)

        # fetch_result only exists when comm_id was given, hence the guard.
        if args.save_as_tgz and args.comm_id:
            if fetch_result.communications:
                with CommunicationWriterTGZ(args.save_as_tgz) as writer:
                    for comm in fetch_result.communications:
                        comm_filename = '%s.concrete' % comm.id
                        print("Saving Communication to TGZ archive '%s' as '%s'" % (
                            args.save_as_tgz, comm_filename))
                        writer.write(comm, comm_filename)