Ejemplo n.º 1
0
def test_CommunicationWriterTGZ_single_file_default_name(output_file, login_info):
    """Write one Communication without an explicit name and inspect the archive.

    The writer is driven through explicit ``open``/``write``/``close`` calls.
    The resulting archive must contain exactly one regular-file member, named
    after the Communication's UUID string plus ``.concrete``, whose size
    matches the input file and whose mode/owner metadata match ``login_info``.

    Args:
        output_file: Path the archive is written to and then read back from.
        login_info: Mapping with ``"uid"``, ``"username"``, ``"gid"`` and
            ``"groupname"`` keys holding the expected tar owner metadata.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterTGZ()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert tarfile.is_tarfile(output_file)

    # Use a context manager so the archive handle is released even when one
    # of the assertions below fails.
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert comm.uuid.uuidString + ".concrete" == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time.time() - TIME_MARGIN
        assert os.stat("tests/testdata/simple_1.concrete").st_size == tarinfo.size
        # ``0o644`` is accepted by Python 2.6+ and Python 3; the legacy
        # ``0644`` spelling is a SyntaxError on Python 3.
        assert 0o644 == tarinfo.mode
        assert login_info["uid"] == tarinfo.uid
        assert login_info["username"] == tarinfo.uname
        assert login_info["gid"] == tarinfo.gid
        assert login_info["groupname"] == tarinfo.gname

        # TarFile.next() returns None once every member has been consumed.
        tarinfo = f.next()
        assert tarinfo is None
Ejemplo n.º 2
0
def validate_communication_file(communication_filename):
    """Read the Communication stored in a file and validate it.

    The filename is logged at INFO level before the file is read. The result
    of ``validate_communication`` is not returned.

    Args:
        communication_filename: Path of the Communication file to read.
    """
    opening_message = (
        "Opening Concrete Communication with filename '%s'"
        % communication_filename)
    logging.info(_ilm(0, opening_message))

    communication = read_communication_from_file(communication_filename)
    validate_communication(communication)
Ejemplo n.º 3
0
def test_CommunicationWriterTGZ_single_file_ctx_mgr(output_file, login_info):
    """Write one named Communication via the context-manager API and inspect it.

    The resulting archive must contain exactly one regular-file member named
    ``simple_1.concrete`` whose size matches the input file and whose
    mode/owner metadata match ``login_info``.

    Args:
        output_file: Path the archive is written to and then read back from.
        login_info: Mapping with ``"uid"``, ``"username"``, ``"gid"`` and
            ``"groupname"`` keys holding the expected tar owner metadata.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterTGZ(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert tarfile.is_tarfile(output_file)

    # Use a context manager so the archive handle is released even when one
    # of the assertions below fails.
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert "simple_1.concrete" == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time.time() - TIME_MARGIN
        assert os.stat("tests/testdata/simple_1.concrete").st_size == tarinfo.size
        # ``0o644`` is accepted by Python 2.6+ and Python 3; the legacy
        # ``0644`` spelling is a SyntaxError on Python 3.
        assert 0o644 == tarinfo.mode
        assert login_info["uid"] == tarinfo.uid
        assert login_info["username"] == tarinfo.uname
        assert login_info["gid"] == tarinfo.gid
        assert login_info["groupname"] == tarinfo.gname

        # TarFile.next() returns None once every member has been consumed.
        tarinfo = f.next()
        assert tarinfo is None
Ejemplo n.º 4
0
def test_CommunicationWriterTGZ_single_file_default_name(output_file):
    """Write one Communication without an explicit name and inspect the archive.

    The writer is driven through explicit ``open``/``write``/``close`` calls.
    The resulting archive must contain exactly one regular-file member, named
    after the Communication's UUID string plus ``.concrete``, whose size
    matches the input file and whose owner metadata match the uid/gid of the
    current process.

    Args:
        output_file: Path the archive is written to and then read back from.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterTGZ()
    # Close the writer even if open() or write() raises, so the output file
    # handle is not leaked.
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert tarfile.is_tarfile(output_file)

    # Use a context manager so the archive handle is released even when one
    # of the assertions below fails.
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert comm.uuid.uuidString + '.concrete' == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time.time() - TIME_MARGIN
        assert os.stat('tests/testdata/simple_1.concrete').st_size == tarinfo.size
        # ``0o644`` is accepted by Python 2.6+ and Python 3; the legacy
        # ``0644`` spelling is a SyntaxError on Python 3.
        assert 0o644 == tarinfo.mode
        assert os.getuid() == tarinfo.uid
        assert pwd.getpwuid(os.getuid()).pw_name == tarinfo.uname
        assert os.getgid() == tarinfo.gid
        assert grp.getgrgid(os.getgid()).gr_name == tarinfo.gname

        # TarFile.next() returns None once every member has been consumed.
        tarinfo = f.next()
        assert tarinfo is None
Ejemplo n.º 5
0
def test_CommunicationWriterTar_single_file_ctx_mgr(output_file):
    """Write one named Communication via the context-manager API and inspect it.

    The resulting archive must contain exactly one regular-file member named
    ``simple_1.concrete`` whose size matches the input file and whose owner
    metadata match the uid/gid of the current process.

    Args:
        output_file: Path the archive is written to and then read back from.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert tarfile.is_tarfile(output_file)

    # Use a context manager so the archive handle is released even when one
    # of the assertions below fails.
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert "simple_1.concrete" == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time.time() - TIME_MARGIN
        assert os.stat('tests/testdata/simple_1.concrete').st_size == tarinfo.size
        # ``0o644`` is accepted by Python 2.6+ and Python 3; the legacy
        # ``0644`` spelling is a SyntaxError on Python 3.
        assert 0o644 == tarinfo.mode
        assert os.getuid() == tarinfo.uid
        assert pwd.getpwuid(os.getuid()).pw_name == tarinfo.uname
        assert os.getgid() == tarinfo.gid
        assert grp.getgrgid(os.getgid()).gr_name == tarinfo.gname

        # TarFile.next() returns None once every member has been consumed.
        tarinfo = f.next()
        assert tarinfo is None
Ejemplo n.º 6
0
def load_comm(filename,
              tool='ud converted ptb trees using pyStanfordDependencies'):
    """Yield one ``(section label, parse)`` pair per sentence of a Concrete file.

    The parse is whatever ``get_udparse(sentence, tool)`` returns. Sections
    and sentence lists that are missing or empty contribute nothing.

    Args:
        filename: Path of the Concrete Communication file to read.
        tool: Tool name forwarded to ``get_udparse`` for each sentence.
    """
    # Imported inside the function so that concrete is only required when
    # this loader is actually used.
    from concrete.util.file_io import read_communication_from_file

    communication = read_communication_from_file(filename)
    # ``or ()`` skips the loop for both None and empty lists, like the
    # truthiness guards it replaces.
    for section in communication.sectionList or ():
        for sentence in section.sentenceList or ():
            yield section.label, get_udparse(sentence, tool)
Ejemplo n.º 7
0
def test_CommunicationWriter_fixed_point_unicode(output_file):
    """Round-trip the les-deux-chandeliers fixture through CommunicationWriter.

    The bytes written to ``output_file`` must equal the bytes of the file the
    Communication was read from.
    """
    source_path = "tests/testdata/les-deux-chandeliers.concrete"
    communication = read_communication_from_file(source_path)

    with CommunicationWriter(output_file) as comm_writer:
        comm_writer.write(communication)

    # Compare raw bytes, so both files are opened in binary mode.
    with open(source_path, "rb") as expected_f, \
            open(output_file, "rb") as actual_f:
        expected_data = expected_f.read()
        actual_data = actual_f.read()
    assert expected_data == actual_data
Ejemplo n.º 8
0
def test_CommunicationWriter_fixed_point_ctx_mgr(output_file):
    """Round-trip simple_1.concrete through CommunicationWriter used as a context manager.

    The bytes written to ``output_file`` must equal the bytes of the file the
    Communication was read from.
    """
    source_path = "tests/testdata/simple_1.concrete"
    communication = read_communication_from_file(source_path)

    with CommunicationWriter(output_file) as comm_writer:
        comm_writer.write(communication)

    # Read each file in its own block; binary mode gives a byte-exact check.
    with open(source_path, "rb") as expected_f:
        expected_data = expected_f.read()
    with open(output_file, "rb") as actual_f:
        actual_data = actual_f.read()

    assert expected_data == actual_data
Ejemplo n.º 9
0
def test_CommunicationWriter_fixed_point_unicode(output_file):
    """Check that writing the les-deux-chandeliers fixture reproduces its bytes.

    The Communication is read from the fixture, written to ``output_file``
    with CommunicationWriter, and the two files are compared byte for byte.
    """
    fixture_path = 'tests/testdata/les-deux-chandeliers.concrete'
    loaded_comm = read_communication_from_file(fixture_path)

    with CommunicationWriter(output_file) as out:
        out.write(loaded_comm)

    # Binary mode on both sides so the comparison is on raw bytes.
    with open(fixture_path, 'rb') as expected_f:
        expected_data = expected_f.read()
    with open(output_file, 'rb') as actual_f:
        actual_data = actual_f.read()

    assert expected_data == actual_data
Ejemplo n.º 10
0
def test_CommunicationWriter_fixed_point_ctx_mgr(output_file):
    """Check that writing simple_1.concrete via a ``with`` block reproduces its bytes.

    The Communication is read from the fixture, written to ``output_file``
    with CommunicationWriter, and the two files are compared byte for byte.
    """
    fixture_path = 'tests/testdata/simple_1.concrete'
    loaded_comm = read_communication_from_file(fixture_path)

    with CommunicationWriter(output_file) as out:
        out.write(loaded_comm)

    # Binary mode on both sides so the comparison is on raw bytes.
    with open(fixture_path, 'rb') as expected_f, \
            open(output_file, 'rb') as actual_f:
        expected_data = expected_f.read()
        actual_data = actual_f.read()
    assert expected_data == actual_data
Ejemplo n.º 11
0
def test_CommunicationWriter_fixed_point(output_file):
    """Round-trip simple_1.concrete through explicit open/write/close calls.

    The bytes written to ``output_file`` must equal the bytes of the file the
    Communication was read from.
    """
    source_path = "tests/testdata/simple_1.concrete"
    communication = read_communication_from_file(source_path)

    comm_writer = CommunicationWriter()
    try:
        comm_writer.open(output_file)
        comm_writer.write(communication)
    finally:
        # Always close the writer, whether or not open/write succeeded.
        comm_writer.close()

    # Read each file in its own block; binary mode gives a byte-exact check.
    with open(source_path, "rb") as expected_f:
        expected_data = expected_f.read()
    with open(output_file, "rb") as actual_f:
        actual_data = actual_f.read()

    assert expected_data == actual_data
Ejemplo n.º 12
0
def test_CommunicationWriterTar_single_file_fixed_point_unicode(output_file, login_info):
    """Round-trip the les-deux-chandeliers fixture through CommunicationWriterTar.

    The archive must contain exactly one member, named
    ``les-deux-chandeliers.concrete``, whose extracted bytes equal the bytes
    of the input file.

    Args:
        output_file: Path the archive is written to and then read back from.
        login_info: Not used by this test; kept so the signature is unchanged.
    """
    comm = read_communication_from_file("tests/testdata/les-deux-chandeliers.concrete")
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "les-deux-chandeliers.concrete")

    assert tarfile.is_tarfile(output_file)

    # Use a context manager so the archive handle is released even when one
    # of the assertions below fails.
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert "les-deux-chandeliers.concrete" == tarinfo.name
        actual_data = f.extractfile(tarinfo).read()
        with open("tests/testdata/les-deux-chandeliers.concrete", "rb") as expected_f:
            expected_data = expected_f.read()
        assert expected_data == actual_data

        # TarFile.next() returns None once every member has been consumed.
        tarinfo = f.next()
        assert tarinfo is None
Ejemplo n.º 13
0
def test_CommunicationWriterTar_single_file_fixed_point(output_file):
    """Round-trip simple_1.concrete through CommunicationWriterTar.

    The archive must contain exactly one member, named ``simple_1.concrete``,
    whose extracted bytes equal the bytes of the input file.

    Args:
        output_file: Path the archive is written to and then read back from.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert tarfile.is_tarfile(output_file)

    # Use a context manager so the archive handle is released even when one
    # of the assertions below fails.
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert "simple_1.concrete" == tarinfo.name
        actual_data = f.extractfile(tarinfo).read()
        with open('tests/testdata/simple_1.concrete', 'rb') as expected_f:
            expected_data = expected_f.read()
        assert expected_data == actual_data

        # TarFile.next() returns None once every member has been consumed.
        tarinfo = f.next()
        assert tarinfo is None
Ejemplo n.º 14
0
def validate_communication_file(communication_filename):
    """Load a Communication from ``communication_filename`` and validate it.

    An INFO-level log line naming the file is emitted before it is read.
    Nothing is returned; the outcome of ``validate_communication`` is
    discarded.
    """
    log_text = _ilm(
        0,
        "Opening Concrete Communication with filename '%s'"
        % communication_filename)
    logging.info(log_text)

    validate_communication(
        read_communication_from_file(communication_filename))
Ejemplo n.º 15
0
import argparse
import csv
import itertools

parser = argparse.ArgumentParser()
parser.add_argument("--tsv", type=str, default="", help="")
parser.add_argument("--concrete_dir", type=str, default="", help="")
parser.add_argument("--lang", type=str, default="eng")
ARGS = parser.parse_args()

tsv = csv.reader(open(ARGS.tsv), delimiter='\t')

for comm_id, rows in itertools.groupby(tsv, key=lambda r: r[0].split(':')[0]):

    try:
        comm = cio.read_communication_from_file(
            f"{ARGS.concrete_dir}/{comm_id}.comm")

        # remove non-English documents
        lang_dist = comm.lidList[0].languageToProbabilityMap
        lang = max(lang_dist.items(), key=lambda t: t[1])[0]
        if ARGS.lang != "all" and lang != ARGS.lang:  # if ARGS.lang == "all", retain all language samples
            continue

        sentences = [
            sentence for section in comm.sectionList
            for sentence in section.sentenceList
        ]

        sentence_indices: np.ndarray = np.array(
            [sentence.textSpan.start for sentence in sentences])
        token_indices: List[np.ndarray] = [