Esempio n. 1
0
def compress_uuids(input_path,
                   output_path,
                   verify=False,
                   uuid_map_path=None,
                   single_analytic=False):
    """Rewrite Communications from *input_path* with compressed UUIDs.

    Reads each Communication from *input_path* (references are not added),
    compresses its UUIDs via ``_compress_uuids``, and writes the result to
    a TGZ archive at *output_path*.

    Args:
        input_path: path of the input Communications.
        output_path: path of the output TGZ archive.
        verify: forwarded to ``_compress_uuids``; when True, the rewrite
            is verified.
        uuid_map_path: optional path of a text file to which each
            ``old-uuid new-uuid`` pair is written, one per line.
        single_analytic: forwarded to ``_compress_uuids``.
    """
    reader = CommunicationReader(input_path, add_references=False)
    writer = CommunicationWriterTGZ(output_path)

    # Open the (optional) UUID-map file up front so a bad path fails fast.
    if uuid_map_path is None:
        uuid_map_file = None
    else:
        uuid_map_file = open(uuid_map_path, 'w')

    try:
        for (i, (comm, comm_filename)) in enumerate(reader):
            (new_comm, uc) = _compress_uuids(comm,
                                             verify=verify,
                                             single_analytic=single_analytic)

            logging.info('compressed %s (%d analytics, %d uuids) (%d/?)' %
                         (comm.id, len(uc.augs), len(uc.uuid_map), i + 1))

            if uuid_map_file is not None:
                # Sort by the new UUID so the map file is deterministic.
                for (old_uuid, new_uuid) in sorted(uc.uuid_map.items(),
                                                   key=lambda p: str(p[1])):
                    uuid_map_file.write('%s %s\n' % (old_uuid, new_uuid))

            writer.write(new_comm, comm_filename=comm_filename)
    finally:
        # The original leaked both handles: close them so the TGZ archive
        # is finalized and the map file is flushed even on error.
        if uuid_map_file is not None:
            uuid_map_file.close()
        writer.close()
def test_CommunicationWriterTGZ_single_file_default_name(output_file):
    """Write one Communication to a TGZ and check the single tar member.

    The member should be named ``<uuid>.concrete`` (the writer's default
    name), be a regular file with mode 0o644, have a recent mtime, and
    carry the current user's uid/gid and user/group names.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterTGZ()
    writer.open(output_file)
    writer.write(comm)
    writer.close()

    assert tarfile.is_tarfile(output_file)

    # Use a context manager so the archive is closed even if an assert fails.
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert comm.uuid.uuidString + '.concrete' == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time.time() - TIME_MARGIN
        assert os.stat('tests/testdata/simple_1.concrete').st_size == tarinfo.size
        # 0644 was a Python 2 octal literal (SyntaxError on Python 3);
        # 0o644 has the same value on both.
        assert 0o644 == tarinfo.mode
        assert os.getuid() == tarinfo.uid
        assert pwd.getpwuid(os.getuid()).pw_name == tarinfo.uname
        assert os.getgid() == tarinfo.gid
        assert grp.getgrgid(os.getgid()).gr_name == tarinfo.gname

        # Exactly one member: the next lookup must return None.
        tarinfo = f.next()
        assert tarinfo is None
Esempio n. 3
0
def test_CommunicationWriterTGZ_single_file_default_name(output_file, login_info):
    """Write one Communication to a TGZ and check the single tar member.

    Like the variant above but takes user/group expectations from the
    ``login_info`` fixture instead of querying ``pwd``/``grp`` directly.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterTGZ()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert tarfile.is_tarfile(output_file)

    # Use a context manager so the archive is closed even if an assert fails.
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert comm.uuid.uuidString + ".concrete" == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time.time() - TIME_MARGIN
        assert os.stat("tests/testdata/simple_1.concrete").st_size == tarinfo.size
        # 0644 was a Python 2 octal literal (SyntaxError on Python 3);
        # 0o644 has the same value on both.
        assert 0o644 == tarinfo.mode
        assert login_info["uid"] == tarinfo.uid
        assert login_info["username"] == tarinfo.uname
        assert login_info["gid"] == tarinfo.gid
        assert login_info["groupname"] == tarinfo.gname

        # Exactly one member: the next lookup must return None.
        tarinfo = f.next()
        assert tarinfo is None
Esempio n. 4
0
def compress_uuids(input_path, output_path, verify=False, uuid_map_path=None,
                   single_analytic=False):
    """Rewrite Communications from *input_path* with compressed UUIDs.

    Reads each Communication from *input_path* (references are not added),
    compresses its UUIDs via ``_compress_uuids``, and writes the result to
    a TGZ archive at *output_path*.

    Args:
        input_path: path of the input Communications.
        output_path: path of the output TGZ archive.
        verify: forwarded to ``_compress_uuids``; when True, the rewrite
            is verified.
        uuid_map_path: optional path of a text file to which each
            ``old-uuid new-uuid`` pair is written, one per line.
        single_analytic: forwarded to ``_compress_uuids``.
    """
    reader = CommunicationReader(input_path, add_references=False)
    writer = CommunicationWriterTGZ(output_path)

    # Open the (optional) UUID-map file up front so a bad path fails fast.
    if uuid_map_path is None:
        uuid_map_file = None
    else:
        uuid_map_file = open(uuid_map_path, 'w')

    try:
        for (i, (comm, comm_filename)) in enumerate(reader):
            (new_comm, uc) = _compress_uuids(comm, verify=verify,
                                             single_analytic=single_analytic)

            logging.info('compressed %s (%d analytics, %d uuids) (%d/?)'
                         % (comm.id, len(uc.augs), len(uc.uuid_map), i + 1))

            if uuid_map_file is not None:
                # Sort by the new UUID so the map file is deterministic.
                for (old_uuid, new_uuid) in sorted(uc.uuid_map.items(),
                                                   key=lambda p: str(p[1])):
                    uuid_map_file.write('%s %s\n' % (old_uuid, new_uuid))

            writer.write(new_comm, comm_filename=comm_filename)
    finally:
        # The original leaked both handles: close them so the TGZ archive
        # is finalized and the map file is flushed even on error.
        if uuid_map_file is not None:
            uuid_map_file.close()
        writer.close()
Esempio n. 5
0
def main():
    """Command-line entry point: convert a tarball of text files into a
    tarball of Concrete Communications.

    ``-`` for either path selects stdin/stdout respectively (via the
    ``/dev/fd`` filesystem, so this does not work on Windows).
    """
    set_stdout_encoding()

    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description='Convert tarball of text files to'
                    ' tarball of concrete communications',
    )
    parser.set_defaults(annotation_level=AL_NONE,
                        log_level='INFO', log_interval=1000)
    parser.add_argument('text_tarball_path', type=str,
                        help='Input text tar file path (- for stdin)')
    parser.add_argument('concrete_tarball_path', type=str,
                        help='Output concrete tar file path (- for stdout)')
    parser.add_argument('--per-line', action='store_true',
                        help='Text files have one document per line (default:'
                             ' each text file is a document)')
    parser.add_argument('--log-interval', type=int,
                        help='Log an info message every log-interval docs')
    add_annotation_level_argparse_argument(parser)
    parser.add_argument('-l', '--loglevel', '--log-level',
                        help='Logging verbosity level threshold (to stderr)',
                        default='info')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    # Map '-' to the /dev/fd entries for stdin/stdout (won't work on Windows).
    if args.text_tarball_path == '-':
        input_path = '/dev/fd/0'
    else:
        input_path = args.text_tarball_path
    if args.concrete_tarball_path == '-':
        output_path = '/dev/fd/1'
    else:
        output_path = args.concrete_tarball_path

    with CommunicationWriterTGZ(output_path) as writer:
        doc_stream = load(input_path, args.per_line, args.annotation_level)
        for (doc_num, comm) in enumerate(doc_stream, start=1):
            if doc_num % args.log_interval == 0:
                logging.info(u'writing doc %d (%s)...' % (doc_num, comm.id))
            writer.write(comm, comm.id)
Esempio n. 6
0
# Stream-wrapper factories: writer(fileobj) returns a stream that encodes
# text to UTF-8 on write; reader(fileobj) returns one that decodes UTF-8
# bytes on read (used below to wrap the gzip input stream).
writer = codecs.getwriter("utf-8")
reader = codecs.getreader("utf-8")

if __name__ == "__main__":

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", dest="input", help="Input file where each line is \"ID<tab>TAG<tab>TEXT\"")
    parser.add_argument("-o", "--output", dest="output", help="Tar file to write Communications to")
    parser.add_argument("-t", "--tag_type", dest="tag_type", default=None, help="Type of tag (e.g. \"language\"): defaults to None, in which case the tag column is ignored (but must still be present!)")
    options = parser.parse_args()

    ugf = AnalyticUUIDGeneratorFactory()
    ofd = CommunicationWriterTGZ(options.output)
    with reader(gzip.open(options.input)) as ifd:
        for i, line in enumerate(ifd):
            toks = line.strip().split("\t")
            if len(toks) != 3:
                continue            
            cid, label, text = toks
            g = ugf.create()
            t = int(time())
            comm = Communication(id=cid,
                                 uuid=g.next(),
                                 type="Text document",
                                 text=text,
                                 communicationTaggingList=[CommunicationTagging(uuid=g.next(),
                                                                                metadata=AnnotationMetadata(tool="Gold labeling",
                                                                                                            timestamp=t,
Esempio n. 7
0
def main():
    """Command-line client for a Concrete FetchCommunicationService.

    Connects via THttp/TJSONProtocol ('--uri') or TSocket/TCompactProtocol
    ('--host'/'--port'), then performs whichever service calls the flags
    request: fetch by Communication ID, about(), alive(),
    getCommunicationCount(), getCommunicationIDs(). '--benchmark' times
    each RPC; '--save-as-tgz' saves fetched Communications to a TGZ.
    """
    set_stdout_encoding()

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Interface with a Concrete FetchCommunicationService server. "
                    "Supports either THttp/TJSONProtocol (using the '--uri' flag) "
                    "or TSocket/TCompactProtocol (using '--host'/'--port')"
    )
    parser.add_argument("--host", default="localhost",
                        help="Hostname of TSocket/TCompactProtocol FetchCommunicationService")
    parser.add_argument("-p", "--port", type=int, default=9090,
                        help="Port of TSocket/TCompactProtocol FetchCommunicationService")
    parser.add_argument('--uri', '--url',
                        help="URI of THttpServer/TJSONProtocol FetchCommunicationService")
    parser.add_argument("--about", action="store_true",
                        help="Print value of fetch_service.about()")
    parser.add_argument("--alive", action="store_true",
                        help="Print value of fetch_service.alive()")
    parser.add_argument("--count", action="store_true",
                        help="Print value of fetch_service.getCommunicationCount()")
    parser.add_argument("--get-ids", action="store_true",
                        help="Print list of Communication IDs returned by "
                             "fetch_service.getCommunicationIDs(offset, count).  "
                             "The offset and count parameters are set using the "
                             "'--get-ids-offset' and '--get-ids-count' flags")
    # BUG FIX: the help strings for these two flags were swapped
    # (offset described as "Number of...", count as "Offset for...").
    parser.add_argument("--get-ids-offset", type=int, default=0, metavar="ID_OFFSET",
                        help="Offset for Communication IDs printed using the '--get-ids' flag")
    parser.add_argument("--get-ids-count", type=int, default=20, metavar="ID_COUNT",
                        help="Number of Communication IDs printed using the '--get-ids' flag")
    parser.add_argument("--save-as-tgz", metavar="TGZ_FILENAME",
                        help="Save fetched Communications to a TGZ archive containing files "
                             "named '[COMMUNICATION_ID].concrete'")
    parser.add_argument('-l', '--loglevel', '--log-level',
                        help='Logging verbosity level threshold (to stderr)',
                        default='info')
    parser.add_argument("comm_id", nargs="*", help="IDs of Communications to be fetched. "
                                                   "If '-' is specified, a list of Communication"
                                                   " IDs will be read from "
                                                   "stdin, one Communication ID per line")
    parser.add_argument("--benchmark", action="store_true",
                        help="Enable Thrift RPC timing instrument.")
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    if args.benchmark:
        if is_accelerated():
            logging.info('Thrift acceleration is enabled.')
        else:
            logging.warning('Thrift acceleration is NOT enabled.')

    # Choose the transport/protocol wrapper based on the flags given.
    if args.uri:
        fetch_wrapper = HTTPFetchCommunicationClientWrapper(args.uri)
    else:
        fetch_wrapper = FetchCommunicationClientWrapper(args.host, args.port)

    with fetch_wrapper as client:
        if args.comm_id:
            fetch_request = FetchRequest()
            # A single '-' means: read the ID list from stdin, one per line.
            if len(args.comm_id) == 1 and args.comm_id[0] == '-':
                fetch_request.communicationIds = [line.strip() for line in sys.stdin.readlines()]
            else:
                fetch_request.communicationIds = args.comm_id

            if args.benchmark:
                start_time = time.time()
            fetch_result = client.fetch(fetch_request)
            if args.benchmark:
                logging.info('Time elapsed in fetch(): {:.4f}s'.format(time.time() - start_time))
            print("Received FetchResult: '%s'" % fetch_result)

        if args.about:
            if args.benchmark:
                start_time = time.time()
            print("FetchCommunicationService.about() returned %s" % client.about())
            if args.benchmark:
                logging.info('Time elapsed in about(): {:.4f}s'.format(time.time() - start_time))
        if args.alive:
            if args.benchmark:
                start_time = time.time()
            print("FetchCommunicationService.alive() returned %s" % client.alive())
            if args.benchmark:
                logging.info('Time elapsed in alive(): {:.4f}s'.format(time.time() - start_time))
        if args.count:
            if args.benchmark:
                start_time = time.time()
            print("FetchCommunicationService.getCommunicationCount() returned %d" %
                  client.getCommunicationCount())
            if args.benchmark:
                logging.info('Time elapsed in getCommunicationCount(): {:.4f}s'.format(
                    time.time() - start_time))
        if args.get_ids:
            print("FetchCommunicationService.getCommunicationIDs(offset=%d, count=%d) returned:" %
                  (args.get_ids_offset, args.get_ids_count))
            if args.benchmark:
                start_time = time.time()
            ids = client.getCommunicationIDs(args.get_ids_offset, args.get_ids_count)
            if args.benchmark:
                logging.info('Time elapsed in getCommunicationIDs(): {:.4f}s'.format(
                    time.time() - start_time))
            for comm_id in ids:
                print("  %s" % comm_id)

        # fetch_result only exists when args.comm_id was given, so the
        # guard below must keep re-checking args.comm_id.
        if args.save_as_tgz and args.comm_id:
            if fetch_result.communications:
                with CommunicationWriterTGZ(args.save_as_tgz) as writer:
                    for comm in fetch_result.communications:
                        comm_filename = '%s.concrete' % comm.id
                        print("Saving Communication to TGZ archive '%s' as '%s'" %
                              (args.save_as_tgz, comm_filename))
                        writer.write(comm, comm_filename)