def test_CommunicationWriterTGZ_single_file_default_name(output_file, login_info):
    """Write one Communication without a filename and inspect the archive.

    Verifies that the file produced by ``CommunicationWriterTGZ`` is a tar
    archive holding exactly one regular-file member named
    ``<comm uuid>.concrete``, whose size equals the on-disk size of the
    source file, whose mode is ``0o644``, whose mtime is no older than
    ``TIME_MARGIN`` seconds before now, and whose owner fields match
    ``login_info``.

    Args:
        output_file: Path the archive is written to (supplied by the test
            harness; presumably a pytest fixture).
        login_info: Mapping with ``uid``, ``username``, ``gid`` and
            ``groupname`` entries describing the expected owner.
    """
    source_path = "tests/testdata/simple_1.concrete"
    comm = read_communication_from_file(source_path)
    writer = CommunicationWriterTGZ()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert tarfile.is_tarfile(output_file)

    # Context manager: the archive handle is released even when an
    # assertion below fails.
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert comm.uuid.uuidString + ".concrete" == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time.time() - TIME_MARGIN
        assert os.stat(source_path).st_size == tarinfo.size
        # 0o644 parses on Python 2.6+ and 3; a bare 0644 is a syntax error
        # on Python 3.
        assert 0o644 == tarinfo.mode
        assert login_info["uid"] == tarinfo.uid
        assert login_info["username"] == tarinfo.uname
        assert login_info["gid"] == tarinfo.gid
        assert login_info["groupname"] == tarinfo.gname

        # Exactly one member: the next read must report end of archive.
        tarinfo = f.next()
        assert tarinfo is None
def compress_uuids(input_path,
                   output_path,
                   verify=False,
                   uuid_map_path=None,
                   single_analytic=False):
    """Run ``_compress_uuids`` over every communication in ``input_path``.

    Each ``(comm, comm_filename)`` pair yielded by ``CommunicationReader``
    is passed through ``_compress_uuids`` and the resulting communication
    is written with ``CommunicationWriterTGZ`` to ``output_path`` under the
    same ``comm_filename``.  One info-level log line is emitted per
    communication.

    Args:
        input_path: Path handed to ``CommunicationReader``.
        output_path: Path handed to ``CommunicationWriterTGZ``.
        verify: Forwarded unchanged to ``_compress_uuids``.
        uuid_map_path: Optional path of a text file to create.  When given,
            one ``"<old_uuid> <new_uuid>"`` line is written per mapping,
            ordered within each communication by ``str(new_uuid)``.
        single_analytic: Forwarded unchanged to ``_compress_uuids``.
    """
    reader = CommunicationReader(input_path, add_references=False)
    writer = CommunicationWriterTGZ(output_path)
    try:
        if uuid_map_path is None:
            uuid_map_file = None
        else:
            uuid_map_file = open(uuid_map_path, 'w')

        try:
            for (count, (comm, comm_filename)) in enumerate(reader, 1):
                (new_comm, uc) = _compress_uuids(
                    comm,
                    verify=verify,
                    single_analytic=single_analytic)

                # Lazy %-args: the message is only formatted when the
                # record is actually emitted.
                logging.info(
                    'compressed %s (%d analytics, %d uuids) (%d/?)',
                    comm.id, len(uc.augs), len(uc.uuid_map), count)

                if uuid_map_file is not None:
                    for (old_uuid, new_uuid) in sorted(
                            uc.uuid_map.items(),
                            key=lambda p: str(p[1])):
                        uuid_map_file.write('%s %s\n' % (old_uuid, new_uuid))

                writer.write(new_comm, comm_filename=comm_filename)
        finally:
            # Flush and release the mapping file even if a communication
            # fails part-way through.
            if uuid_map_file is not None:
                uuid_map_file.close()
    finally:
        # Always close the writer so its output file handle is released.
        writer.close()
def test_CommunicationWriterTGZ_single_file_default_name(output_file):
    """Write one Communication without a filename and inspect the archive.

    Verifies that the file produced by ``CommunicationWriterTGZ`` is a tar
    archive holding exactly one regular-file member named
    ``<comm uuid>.concrete``, whose size equals the on-disk size of the
    source file, whose mode is ``0o644``, whose mtime is no older than
    ``TIME_MARGIN`` seconds before now, and whose owner fields match the
    uid/gid (and their passwd/group names) of the current process.

    Args:
        output_file: Path the archive is written to (supplied by the test
            harness; presumably a pytest fixture).
    """
    source_path = 'tests/testdata/simple_1.concrete'
    comm = read_communication_from_file(source_path)
    writer = CommunicationWriterTGZ()
    # Close the writer even if open/write raises, so the output handle is
    # not leaked into later tests.
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert tarfile.is_tarfile(output_file)

    # Context manager: the archive handle is released even when an
    # assertion below fails.
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert comm.uuid.uuidString + '.concrete' == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time.time() - TIME_MARGIN
        assert os.stat(source_path).st_size == tarinfo.size
        # 0o644 parses on Python 2.6+ and 3; a bare 0644 is a syntax error
        # on Python 3.
        assert 0o644 == tarinfo.mode
        assert os.getuid() == tarinfo.uid
        assert pwd.getpwuid(os.getuid()).pw_name == tarinfo.uname
        assert os.getgid() == tarinfo.gid
        assert grp.getgrgid(os.getgid()).gr_name == tarinfo.gname

        # Exactly one member: the next read must report end of archive.
        tarinfo = f.next()
        assert tarinfo is None
Exemple #4
0
def compress_uuids(input_path, output_path, verify=False, uuid_map_path=None,
                   single_analytic=False):
    """Run ``_compress_uuids`` over every communication in ``input_path``.

    Each ``(comm, comm_filename)`` pair yielded by ``CommunicationReader``
    is passed through ``_compress_uuids`` and the resulting communication
    is written with ``CommunicationWriterTGZ`` to ``output_path`` under the
    same ``comm_filename``.  One info-level log line is emitted per
    communication.

    Args:
        input_path: Path handed to ``CommunicationReader``.
        output_path: Path handed to ``CommunicationWriterTGZ``.
        verify: Forwarded unchanged to ``_compress_uuids``.
        uuid_map_path: Optional path of a text file to create.  When given,
            one ``"<old_uuid> <new_uuid>"`` line is written per mapping,
            ordered within each communication by ``str(new_uuid)``.
        single_analytic: Forwarded unchanged to ``_compress_uuids``.
    """
    reader = CommunicationReader(input_path, add_references=False)
    writer = CommunicationWriterTGZ(output_path)
    try:
        if uuid_map_path is None:
            uuid_map_file = None
        else:
            uuid_map_file = open(uuid_map_path, 'w')

        try:
            for (count, (comm, comm_filename)) in enumerate(reader, 1):
                (new_comm, uc) = _compress_uuids(
                    comm, verify=verify, single_analytic=single_analytic)

                # Lazy %-args: the message is only formatted when the
                # record is actually emitted.
                logging.info(
                    'compressed %s (%d analytics, %d uuids) (%d/?)',
                    comm.id, len(uc.augs), len(uc.uuid_map), count)

                if uuid_map_file is not None:
                    for (old_uuid, new_uuid) in sorted(
                            uc.uuid_map.items(),
                            key=lambda p: str(p[1])):
                        uuid_map_file.write('%s %s\n' % (old_uuid, new_uuid))

                writer.write(new_comm, comm_filename=comm_filename)
        finally:
            # Flush and release the mapping file even if a communication
            # fails part-way through.
            if uuid_map_file is not None:
                uuid_map_file.close()
    finally:
        # Always close the writer so its output file handle is released.
        writer.close()
Exemple #5
0
    # Fragment of a larger function whose header is not visible here.
    # Converts a gzip-compressed, tab-separated input (options.input) into
    # Communications written to options.output, one per accepted line.
    ofd = CommunicationWriterTGZ(options.output)
    # `reader` is defined outside this fragment; it wraps the gzip stream
    # and is used as a context manager (presumably a text-decoding wrapper
    # -- TODO confirm).
    with reader(gzip.open(options.input)) as ifd:
        # NOTE: the enumerate index `i` is not used in the visible code.
        for i, line in enumerate(ifd):
            # A usable line has exactly three tab-separated fields:
            # communication id, label, text.  Any other line is skipped
            # without a warning.
            toks = line.strip().split("\t")
            if len(toks) != 3:
                continue            
            cid, label, text = toks
            # `ugf` is defined outside this fragment (presumably a UUID
            # generator factory -- TODO confirm).  A fresh generator is
            # created for every communication.
            g = ugf.create()
            # `time` is called directly here, so in this scope it must be
            # the function (e.g. `from time import time`), not the module.
            # The same timestamp is used for both metadata objects below.
            t = int(time())
            # Build one Communication per line with:
            #   - a single CommunicationTagging of type options.tag_type
            #     whose only tag is `label`, with confidence 1.0;
            #   - a single "content" Section spanning the whole text
            #     (start=0, ending=len(text)).
            # NOTE(review): `g.next()` is the Python 2 spelling; if `g` is
            # a generator this needs `next(g)` on Python 3 -- confirm.
            comm = Communication(id=cid,
                                 uuid=g.next(),
                                 type="Text document",
                                 text=text,
                                 communicationTaggingList=[CommunicationTagging(uuid=g.next(),
                                                                                metadata=AnnotationMetadata(tool="Gold labeling",
                                                                                                            timestamp=t,
                                                                                                            kBest=1,
                                                                                ),
                                                                                taggingType=options.tag_type,
                                                                                tagList=[label],
                                                                                confidenceList=[1.0],
                                 )],
                                 metadata=AnnotationMetadata(tool="text_to_concrete.py ingester", timestamp=t, kBest=1),
                                 sectionList=[Section(uuid=g.next(),
                                                      textSpan=TextSpan(start=0, ending=len(text)),
                                                      kind="content",
                                                      )
                                 ])
            ofd.write(comm)
    # NOTE(review): this close is not inside a try/finally, so an exception
    # raised while reading or writing above leaves the writer unclosed.
    ofd.close()