def test_create_comm_annotated(output_file, text):
    """create-comm.py with --annotation-level section emits one sectioned comm."""
    proc = Popen([
        sys.executable,
        'scripts/create-comm.py',
        '--annotation-level', 'section',
        'tests/testdata/les-deux-chandeliers.txt',
        output_file
    ], stdout=PIPE, stderr=PIPE)
    proc.communicate()
    assert proc.returncode == 0

    comm_iter = iter(CommunicationReader(output_file))

    (comm, _) = next(comm_iter)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert len(comm.sectionList) == 2

    # Exactly one communication should have been written.
    sentinel = object()
    assert next(comm_iter, sentinel) is sentinel
def test_create_comm_stdout(output_file, text):
    """create-comm.py with output '-' writes a readable comm to stdout."""
    proc = Popen([
        sys.executable,
        'scripts/create-comm.py',
        'tests/testdata/les-deux-chandeliers.txt',
        '-'
    ], stdout=PIPE, stderr=PIPE)
    (captured_stdout, _) = proc.communicate()
    assert proc.returncode == 0

    # Persist the captured stream so CommunicationReader can open it.
    with open(output_file, 'wb') as out:
        out.write(captured_stdout)

    comm_iter = iter(CommunicationReader(output_file))

    (comm, _) = next(comm_iter)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert comm.sectionList is None

    # Exactly one communication should have been written.
    sentinel = object()
    assert next(comm_iter, sentinel) is sentinel
Example #3
0
def test_compress_uuids_api(reader_kwargs, compress_kwargs):
    """compress_uuids keeps ids and validity for every comm in the archive."""
    input_file = 'tests/testdata/simple.tar.gz'
    comm_iter = iter(CommunicationReader(input_file, **reader_kwargs))

    for expected_id in ('one', 'two', 'three'):
        (comm, _) = next(comm_iter)
        (new_comm, uc) = compress_uuids(comm, **compress_kwargs)
        assert new_comm.id == expected_id
        assert comm.id == new_comm.id
        assert validate_communication(new_comm)

    # The archive contains exactly three communications.
    sentinel = object()
    assert next(comm_iter, sentinel) is sentinel
Example #4
0
def test_tar(fifo):
    """An uncompressed tar streamed through a FIFO yields all three comms."""
    writer_proc = Process(target=write_fifo,
                          args=('tests/testdata/simple.tar', fifo))
    writer_proc.start()

    comm_iter = iter(CommunicationReader(fifo, filetype=FileType.TAR))

    for expected_id in ('one', 'two', 'three'):
        (comm, _) = next(comm_iter)
        assert comm.id == expected_id

    # No further communications in the stream.
    sentinel = object()
    assert next(comm_iter, sentinel) is sentinel

    writer_proc.join()
Example #5
0
def test_create_comm_tarball(output_file, text_l0, text_l1):
    """create-comm-tarball.py converts each tarball member to a comm."""
    proc = Popen([
        sys.executable, 'scripts/create-comm-tarball.py',
        'tests/testdata/les-deux-chandeliers.tar.gz', output_file
    ],
                 stdout=PIPE,
                 stderr=PIPE)
    proc.communicate()
    assert proc.returncode == 0

    comm_iter = iter(CommunicationReader(output_file))

    expected = [
        ('les-deux-chandeliers/l0.txt', text_l0),
        ('les-deux-chandeliers/l1.txt', text_l1),
    ]
    for (expected_id, expected_text) in expected:
        (comm, _) = next(comm_iter)
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        assert comm.sectionList is None

    # Only the two members above should be present.
    sentinel = object()
    assert next(comm_iter, sentinel) is sentinel
Example #6
0
def test_CommunicationReader_single_gz_file_no_add_references():
    """With add_references=False the reference lookup attrs are absent."""
    path = u'tests/testdata/simple_1.concrete.gz'
    reader = CommunicationReader(path, add_references=False)
    (comm, comm_source) = next(reader)
    assert not hasattr(comm, 'sentenceForUUID')
    assert comm.id == u'one'
    assert comm_source == path
Example #7
0
def test_CommunicationReader_explicit_single_gz_file():
    """An explicit STREAM_GZ filetype reads a single gzipped comm."""
    path = u'tests/testdata/simple_1.concrete.gz'
    reader = CommunicationReader(path, filetype=FileType.STREAM_GZ)
    (comm, comm_source) = next(reader)
    assert hasattr(comm, 'sentenceForUUID')
    assert comm.id == u'one'
    assert comm_source == path
Example #8
0
def test_CommunicationReader_tar_gz_file_unicode():
    """A tar.gz with unicode-bearing comms yields both members in order."""
    reader = CommunicationReader(
        "tests/testdata/les-deux-chandeliers.concrete.tar.gz")
    comms = [comm for (comm, _) in reader]
    assert len(comms) == 2
    assert comms[0].id == 'les-deux-chandeliers/l0.txt'
    assert comms[1].id == 'les-deux-chandeliers/l1.txt'
Example #9
0
def test_tweets2concrete_log_config(log_conf, output_file):
    """--log-conf-path routes INFO messages to the configured log file."""
    (log_conf_path, log_path) = log_conf
    proc = Popen([
        sys.executable, 'scripts/tweets2concrete.py', '--log-conf-path',
        log_conf_path, '--log-interval', '1', 'tests/testdata/tweets.json',
        output_file
    ],
                 stdout=PIPE,
                 stderr=PIPE)
    (captured_stdout, captured_stderr) = proc.communicate()
    # Nothing should reach the console; logging goes to the file instead.
    assert captured_stdout.decode('utf-8') == ''
    assert captured_stderr.decode('utf-8') == ''
    assert proc.returncode == 0

    with open(log_path) as log_file:
        contents = log_file.read()
    info_lines = [line for line in contents.strip().split('\n')
                  if 'INFO' in line]
    assert len(info_lines) >= 2

    comm_iter = iter(CommunicationReader(output_file))

    (comm, _) = next(comm_iter)
    assert_first_comm(comm)

    (comm, _) = next(comm_iter)
    assert_second_comm(comm)

    # Exactly two tweets should have been converted.
    sentinel = object()
    assert next(comm_iter, sentinel) is sentinel
Example #10
0
def test_tweets2concrete(output_file):
    """tweets2concrete.py converts the two test tweets to communications."""
    proc = Popen([
        sys.executable, 'scripts/tweets2concrete.py',
        'tests/testdata/tweets.json', output_file
    ],
                 stdout=PIPE,
                 stderr=PIPE)
    proc.communicate()
    assert proc.returncode == 0

    comm_iter = iter(CommunicationReader(output_file))

    (comm, _) = next(comm_iter)
    assert_first_comm(comm)

    (comm, _) = next(comm_iter)
    assert_second_comm(comm)

    # Exactly two tweets should have been converted.
    sentinel = object()
    assert next(comm_iter, sentinel) is sentinel
Example #11
0
def test_tweets2concrete_log_every(output_file):
    """--log-interval 1 logs at least one INFO line per converted tweet."""
    proc = Popen([
        sys.executable, 'scripts/tweets2concrete.py', '--log-level', 'INFO',
        '--log-interval', '1', 'tests/testdata/tweets.json', output_file
    ],
                 stdout=PIPE,
                 stderr=PIPE)
    (_, captured_stderr) = proc.communicate()
    assert proc.returncode == 0
    info_lines = [
        line for line in captured_stderr.decode('utf-8').strip().split('\n')
        if 'INFO' in line
    ]
    assert len(info_lines) >= 2

    comm_iter = iter(CommunicationReader(output_file))

    (comm, _) = next(comm_iter)
    assert_first_comm(comm)

    (comm, _) = next(comm_iter)
    assert_second_comm(comm)

    # Exactly two tweets should have been converted.
    sentinel = object()
    assert next(comm_iter, sentinel) is sentinel
Example #12
0
def main():
    """Encode a Communication archive as a one-column CSV file.

    Each row of the output CSV contains a single TJSONProtocol-encoded
    Communication read from the input archive.
    """
    parser = argparse.ArgumentParser(
        description=
        "Encode a Communication archive as a CSV file, where each row contains a "
        "TJSONProtocol encoded Communication",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'comms_archive',
        help="A directory, TGZ file or Zip file of Communications")
    parser.add_argument(
        'csv_file',
        help="Output CSV file with TJSONProtocol encoded Communications")
    parser.add_argument('--column-name',
                        default='comm',
                        help="Name to use for CSV column header")
    args = parser.parse_args()

    # Use a context manager so the output file is flushed and closed even if
    # serialization fails partway through (the original leaked the handle).
    with open(args.csv_file, 'wb') as csv_fh:
        fieldnames = [args.column_name]
        writer = unicodecsv.DictWriter(csv_fh,
                                       fieldnames,
                                       lineterminator='\n',
                                       quoting=unicodecsv.QUOTE_ALL)
        writer.writeheader()

        for (comm, filename) in CommunicationReader(args.comms_archive):
            # Serialize to Thrift JSON, then decode bytes -> unicode for the
            # unicodecsv writer.
            json_communication = TSerialization.serialize(
                comm, TJSONProtocol.TJSONProtocolFactory()).decode('utf-8')
            writer.writerow({args.column_name: json_communication})
Example #13
0
def test_CommunicationReader_single_bz2_file():
    """A single .bz2 comm is detected and read without an explicit filetype."""
    path = u'tests/testdata/simple_1.concrete.bz2'
    reader = CommunicationReader(path)
    (comm, comm_source) = next(reader)
    assert hasattr(comm, 'sentenceForUUID')
    assert comm.id == u'one'
    assert comm_source == path
Example #14
0
def main():
    """Command-line client for a Concrete Annotator service: reads comms,
    sends each to the service, and writes the annotated results."""
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description="Interface with a Concrete Annotator service")
    parser.add_argument('host',
                        help="Hostname of annotator service to which to"
                        " connect.")
    parser.add_argument('port',
                        type=int,
                        help="Port of annotator service to which to connect.")
    parser.add_argument('--input',
                        default='-',
                        help="Input source to use. '-' for stdin; otherwise"
                        " takes a path to a file.")
    parser.add_argument('--output',
                        default='-',
                        help="Output source to use. '-' for stdout; otherwise"
                        " takes a path to a file.")
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    # Map '-' to the process's stdin/stdout device files.
    # Won't work on Windows... but that use case is very unlikely
    if args.input == '-':
        input_path = '/dev/fd/0'
    else:
        input_path = args.input
    if args.output == '-':
        output_path = '/dev/fd/1'
    else:
        output_path = args.output

    reader = CommunicationReader(input_path)
    with AnnotatorClientWrapper(args.host, args.port) as client:
        with CommunicationWriter(output_path) as writer:
            for (comm, _) in reader:
                writer.write(client.annotate(comm))
Example #15
0
def test_concatenated_bz2(fifo):
    '''
    Concatenated bz2 streams can be read through a FIFO.

    Note: concatenated_gz does not work, complaining about a tell (seek).
    tar_gz does work because the r|gz mode in tarfile results in direct
    calls to zlib for decompression.  gzip (which wraps zlib and is used
    in CommunicationReader for non-tar gz files) is the culprit.
    '''

    writer_proc = Process(target=write_fifo,
                          args=('tests/testdata/simple_concatenated.bz2',
                                fifo))
    writer_proc.start()

    comm_iter = iter(CommunicationReader(fifo, filetype=FileType.STREAM_BZ2))

    for expected_id in ('one', 'two', 'three'):
        (comm, _) = next(comm_iter)
        assert comm.id == expected_id

    # No further communications in the stream.
    sentinel = object()
    assert next(comm_iter, sentinel) is sentinel

    writer_proc.join()
Example #16
0
def test_CommunicationReader_explicit_concatenated_bz2_file():
    """STREAM_BZ2 reads all comms from a concatenated bz2 file."""
    path = u'tests/testdata/simple_concatenated.bz2'
    reader = CommunicationReader(path, filetype=FileType.STREAM_BZ2)
    pairs = list(reader)
    for (i, expected_id) in enumerate([u'one', u'two', u'three']):
        (comm, comm_source) = pairs[i]
        assert hasattr(comm, 'sentenceForUUID')
        assert comm.id == expected_id
        assert comm_source == path
Example #17
0
def test_CommunicationReader_concatenated_gz_file():
    """A concatenated gz file yields all member comms with the file path."""
    path = u'tests/testdata/simple_concatenated.gz'
    reader = CommunicationReader(path)
    pairs = list(reader)
    for (i, expected_id) in enumerate([u'one', u'two', u'three']):
        (comm, comm_source) = pairs[i]
        assert hasattr(comm, 'sentenceForUUID')
        assert comm.id == expected_id
        assert comm_source == path
Example #18
0
def test_CommunicationReader_zip_file():
    """A zip archive yields comms paired with their member filenames."""
    pairs = list(CommunicationReader("tests/testdata/simple.zip"))
    expected = [
        (u'one', u'simple_1.concrete'),
        (u'two', u'simple_2.concrete'),
        (u'three', u'simple_3.concrete'),
    ]
    for (i, (expected_id, expected_filename)) in enumerate(expected):
        (comm, filename) = pairs[i]
        assert hasattr(comm, 'sentenceForUUID')
        assert comm.id == expected_id
        assert filename == expected_filename
Example #19
0
def main():
    """Read Communications from a file (or stdin) and store each one into an
    AWS S3 bucket, keyed by communication id."""
    set_stdout_encoding()

    parser = argparse.ArgumentParser(
        description='Read communications from file and write to an AWS S3 '
                    'bucket (keyed by communication id).',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('input_path',
                        help='path to input communications (uncompressed, '
                             'gz, bz2, tar, zip, etc.) (if "-", read from '
                             'stdin)')
    parser.add_argument('bucket_name', help='name of S3 bucket to write to')
    parser.add_argument('--prefix-len', type=int, default=DEFAULT_S3_KEY_PREFIX_LEN,
                        help='S3 keys are prefixed with hashes of this length')
    parser.add_argument('-l', '--loglevel', '--log-level',
                        help='Logging verbosity level threshold (to stderr)',
                        default='info')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    # if input_path is '-', read from stdin
    if args.input_path == '-':
        # /dev/fd/0 is the calling process's stdin (POSIX-only).
        pairs = CommunicationReader('/dev/fd/0', filetype=FileType.STREAM)
    else:
        pairs = CommunicationReader(args.input_path)

    logging.info('connecting to s3')
    conn = connect_s3()
    logging.info('retrieving bucket {}'.format(args.bucket_name))
    bucket = conn.get_bucket(args.bucket_name)
    logging.info('reading from {}; writing to s3 bucket {}, prefix length {}'.format(
        args.input_path, args.bucket_name, args.prefix_len))
    # S3BackedStoreHandler derives each object key from the comm id plus a
    # hash prefix of the configured length.
    handler = S3BackedStoreHandler(bucket, args.prefix_len)
    for (comm, _) in pairs:
        logging.info('storing {}'.format(comm.id))
        handler.store(comm)
Example #20
0
def test_CommunicationReader_tar_gz_file_no_add_references():
    """add_references=False skips reference dicts for every comm in a tar.gz."""
    pairs = list(CommunicationReader("tests/testdata/simple.tar.gz",
                                     add_references=False))
    expected = [
        (u'one', u'simple_1.concrete'),
        (u'two', u'simple_2.concrete'),
        (u'three', u'simple_3.concrete'),
    ]
    for (i, (expected_id, expected_filename)) in enumerate(expected):
        (comm, filename) = pairs[i]
        assert not hasattr(comm, 'sentenceForUUID')
        assert comm.id == expected_id
        assert filename == expected_filename
Example #21
0
def test_CommunicationReader_explicit_nested_tar_file():
    """Members nested in subdirectories of a tar keep their full paths."""
    pairs = list(CommunicationReader("tests/testdata/simple_nested.tar",
                                     filetype=FileType.TAR))
    expected = [
        (u'one', u'a/b/simple_1.concrete'),
        (u'two', u'a/c/simple_2.concrete'),
        (u'three', u'a/c/simple_3.concrete'),
    ]
    for (i, (expected_id, expected_filename)) in enumerate(expected):
        (comm, filename) = pairs[i]
        assert hasattr(comm, 'sentenceForUUID')
        assert comm.id == expected_id
        assert filename == expected_filename
def main():
    """Scan every tokenization in a Communication file for empty dependency
    parses, logging each inspected comm's id."""
    parser = argparse.ArgumentParser(
        description="Inspect empty dependency parses")
    parser.add_argument('communication_file')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    logging.basicConfig(format='%(levelname)7s:  %(message)s',
                        level=logging.INFO)

    for (comm, filename) in CommunicationReader(args.communication_file):
        # Fixed: the message was missing its closing quote after %s; also use
        # logging's lazy %-style arguments instead of eager interpolation.
        logging.info(u"Inspecting Communication with ID '%s'", comm.id)
        for tokenization in concrete.inspect.get_tokenizations(comm):
            inspect_dependency_parses(tokenization)
Example #23
0
def test_compress_uuids(output_file, args):
    """compress-uuids.py preserves comms and shrinks the archive."""
    input_file = 'tests/testdata/simple.tar.gz'

    proc = Popen(
        [sys.executable, 'scripts/compress-uuids.py', input_file, output_file
         ] + list(args),
        stdout=PIPE,
        stderr=PIPE)
    proc.communicate()

    assert proc.returncode == 0

    comm_iter = iter(CommunicationReader(output_file))

    expected = [
        ('simple_1.concrete', 'one'),
        ('simple_2.concrete', 'two'),
        ('simple_3.concrete', 'three'),
    ]
    for (expected_filename, expected_id) in expected:
        (comm, comm_filename) = next(comm_iter)
        assert comm_filename == expected_filename
        assert comm.id == expected_id
        assert validate_communication(comm)

    # UUID compression should make the output smaller than the input.
    assert os.stat(output_file).st_size < os.stat(input_file).st_size

    # Exactly three communications should have been written.
    sentinel = object()
    assert next(comm_iter, sentinel) is sentinel
Example #24
0
def test_create_comm_tarball_log_every(output_file, text_l0, text_l1):
    """--log-interval 1 logs at least one INFO line per converted member."""
    proc = Popen([
        sys.executable, 'scripts/create-comm-tarball.py', '--log-level',
        'INFO', '--log-interval', '1',
        'tests/testdata/les-deux-chandeliers.tar.gz', output_file
    ],
                 stdout=PIPE,
                 stderr=PIPE)
    (_, captured_stderr) = proc.communicate()
    assert proc.returncode == 0
    info_lines = [
        line for line in captured_stderr.decode('utf-8').strip().split('\n')
        if 'INFO' in line
    ]
    assert len(info_lines) >= 2

    comm_iter = iter(CommunicationReader(output_file))

    expected = [
        ('les-deux-chandeliers/l0.txt', text_l0),
        ('les-deux-chandeliers/l1.txt', text_l1),
    ]
    for (expected_id, expected_text) in expected:
        (comm, _) = next(comm_iter)
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        assert comm.sectionList is None

    # Only the two members above should be present.
    sentinel = object()
    assert next(comm_iter, sentinel) is sentinel
Example #25
0
def test_CommunicationReader_truncated_tgz_file():
    """A truncated member in a .tar.gz raises EOFError after the valid comm."""
    reader = CommunicationReader('tests/testdata/simple_1_and_truncated.tar.gz')
    # Fixed: reader.next() is Python-2-only; use the builtin next(), as the
    # rest of this suite does.
    (simple_comm, _) = next(reader)
    with raises(EOFError):
        (truncated_comm, _) = next(reader)
Example #26
0
def preprocess(tar_path, output_path):
    '''
    Extract (headline, first paragraph) pairs from a tar of Communications
    and write them as JSON lines.

    tar_path  -- tar file to process
    output_path -- directory of the output file
                   each line of the output file has the format {'Headline': string, 'Text': string}

    Returns the basename of the file written under output_path.
    '''

    fname = "%s.txt" % tar_path.split('/')[-1].split('.')[0]
    out_fname = os.path.join(output_path, fname)

    # headline -> first paragraph; used to drop duplicate headlines.
    mem = {}

    with open(out_fname, 'w') as f:
        for (comm, filename) in CommunicationReader(tar_path):
            # Pull the headline and first paragraph out of the raw SGML-ish
            # text via substring offsets.
            text = comm.text
            headline_start = text.find("<HEADLINE>")
            headline_end = text.find('</HEADLINE>', headline_start)
            par1_start = text.find("<P>", headline_end)
            par1_end = text.find("</P>", par1_start)
            headline = text[headline_start +
                            len('<HEADLINE>'):headline_end].strip()
            par1 = text[par1_start + len("<P>"):par1_end].strip()
            # Fixed idiom: membership test directly on the dict rather than
            # `in mem.keys()` (an O(n) list scan on Python 2).
            if headline in mem:
                continue
            mem[headline] = par1

            # process headline
            if comm.id.startswith("XIN"):
                # for xinhua headline, remove anything before : or anything after :
                # Example sentences that need to be modified:
                # Roundup: Gulf Arab markets end on a mixed note
                # Israelis more distrustful of gov't institutions: survey
                a = headline.find(":")
                if a != -1:
                    b = headline.rfind(":")
                    if a == b:
                        # Single colon: keep the longer side of the headline.
                        if a < len(headline) / 2:
                            headline = headline[a + 1:]
                        else:
                            headline = headline[:b]
                    else:
                        # Two or more colons: keep the middle segment.
                        headline = headline[a + 1:b]
            headline_token = word_tokenize(headline)
            # remove punctuations (numbers are left as-is)
            headline_token = [
                t.strip(string.punctuation).lower() for t in headline_token
            ]
            # ignore if headline is too short
            if len(headline_token) < 3:
                continue

            # process the first paragraph the same way
            par1_token = word_tokenize(par1)
            par1_token = [
                t.strip(string.punctuation).lower() for t in par1_token
            ]

            # Fixed idiom: join the token lists directly instead of wrapping
            # them in identity list comprehensions.
            headline = " ".join(headline_token)
            par1 = " ".join(par1_token)
            obj = {'Headline': headline, "Text": par1}
            json_str = json.dumps(obj)
            f.write(json_str + '\n')
    print("completed file %s" % fname)
    return fname
Example #27
0
def test_CommunicationReader_truncated_tgz_file():
    """A truncated member in a .tar.gz raises EOFError after the valid comm."""
    reader = CommunicationReader(
        'tests/testdata/simple_1_and_truncated.tar.gz')
    # Fixed: reader.next() is Python-2-only; use the builtin next(), as the
    # rest of this suite does.
    (simple_comm, _) = next(reader)
    with raises(EOFError):
        (truncated_comm, _) = next(reader)
Example #28
0
def test_CommunicationReader_truncated_gz_file():
    """Iterating a truncated .gz comm raises EOFError."""
    reader = CommunicationReader('tests/testdata/truncated.comm.gz')
    with raises(EOFError):
        for _ in reader:
            pass
Example #29
0
def main():
    """Client for a Concrete AnnotateCommunicationService: read comms from a
    file or stdin, annotate each via the service, and write the results.

    Supports two transports: THttp/TJSONProtocol (selected with --uri) or
    TSocket/TCompactProtocol (selected with --host/--port).
    """
    set_stdout_encoding()

    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description=
        "Interface with a Concrete AnnotateCommunicationService server. "
        "Supports either THttp/TJSONProtocol (using the '--uri' flag) "
        "or TSocket/TCompactProtocol (using '--host'/'--port')")
    parser.add_argument(
        '--host',
        default='localhost',
        help="Hostname of TSocket/TCompactProtocol AnnotateCommunicationService"
    )
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=9090,
        help="Port of TSocket/TCompactProtocol AnnotateCommunicationService")
    parser.add_argument(
        '--uri',
        '--url',
        help="URI of THttpServer/TJSONProtocol AnnotateCommunicationService")
    parser.add_argument('-l',
                        '--loglevel',
                        '--log-level',
                        help='Logging verbosity level threshold (to stderr)',
                        default='info')
    parser.add_argument('--input',
                        default='-',
                        help="Input source to use. '-' for stdin; otherwise"
                        " takes a path to a file.")
    parser.add_argument('--output',
                        default='-',
                        help="Output source to use. '-' for stdout; otherwise"
                        " takes a path to a file.")
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    # Map '-' to the process's stdin/stdout device files (POSIX-only).
    # Won't work on Windows
    if args.input == '-':
        # stdin is a stream, not a seekable file, so force STREAM filetype.
        reader_kwargs = dict(filetype=FileType.STREAM)
        input_path = '/dev/fd/0'
    else:
        reader_kwargs = dict()
        input_path = args.input
    output_path = '/dev/fd/1' if args.output == '-' else args.output

    reader = CommunicationReader(input_path, **reader_kwargs)
    if args.uri:
        # HTTP transport: report a friendlier error if the endpoint speaks
        # HTTP but is not a Thrift THttp/TJSONProtocol annotator.
        try:
            with HTTPAnnotateCommunicationClientWrapper(args.uri) as client:
                with CommunicationWriter(output_path) as writer:
                    for (comm, _) in reader:
                        writer.write(client.annotate(comm))
        except TProtocolException as ex:
            logging.error(ex)
            logging.error(
                "Successfully connected to the URI '{}' using HTTP, but the URI does not "
                "appear to be an AnnotateCommunicationService endpoint that uses the "
                "Thrift THttp transport and TJSONProtocol encoding".format(
                    args.uri))
    else:
        # Socket transport; transport-level failures are silently ignored.
        try:
            with AnnotateCommunicationClientWrapper(args.host,
                                                    args.port) as client:
                with CommunicationWriter(output_path) as writer:
                    for (comm, _) in reader:
                        writer.write(client.annotate(comm))
        except TTransportException:
            pass
Example #30
0
def test_CommunicationReader_single_file_unicode():
    """A single uncompressed comm file with unicode text yields one comm."""
    pairs = list(CommunicationReader(
        "tests/testdata/les-deux-chandeliers.concrete"))
    assert len(pairs) == 1
    (comm, _) = pairs[0]
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
Example #31
0
def main():
    """Print human-readable views of Concrete Communications to stdout.

    Communications are read from ``communication_filename`` when given,
    otherwise from standard input.  Each requested view (text, sections,
    tokens, parse trees, CoNLL-style tags, entities, situations,
    taggings, metadata, ...) is printed for every Communication,
    optionally restricted to one annotation tool via the matching
    ``--*-tool`` flag.  Exits with status 1 (after printing usage) if no
    view flag is given or a ``--*-tool`` flag lacks its view flag.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(
        description="Print information about a Concrete Communication to"
        " stdout.  If communication_filename is specified, read"
        " communication from file; otherwise, read from standard"
        " input.", )
    parser.add_argument('--count',
                        type=int,
                        help='Print at most this many communications.')
    parser.add_argument('--annotation-headers',
                        action='store_true',
                        help='Print annotation type headers.')
    parser.add_argument("--char-offsets",
                        help="Print token text extracted from character "
                        "offsets (not the text stored in the "
                        "tokenization) in 'CoNLL-style' format",
                        action="store_true")
    parser.add_argument("--dependency",
                        help="Print HEAD and DEPREL tags for first dependency "
                        "parse in 'CoNLL-style' format",
                        action="store_true")
    parser.add_argument("--dependency-tool",
                        type=str,
                        help='Filter --dependency output to specified '
                        'tool (requires --dependency)')
    parser.add_argument("--entities",
                        help="Print info about all Entities and their "
                        "EntityMentions",
                        action="store_true")
    parser.add_argument("--entities-tool",
                        type=str,
                        help='Filter --entities output to specified '
                        'tool (requires --entities)')
    parser.add_argument("--lemmas",
                        help="Print first set of lemma token tags in "
                        "'CoNLL-style' format",
                        action="store_true")
    parser.add_argument("--lemmas-tool",
                        type=str,
                        help='Filter --lemmas output to specified '
                        'tool (requires --lemmas)')
    parser.add_argument("--metadata",
                        help="Print metadata for tools used to annotate "
                        "Communication",
                        action="store_true")
    parser.add_argument("--metadata-tool",
                        type=str,
                        help='Filter --metadata output to specified '
                        'tool (requires --metadata)')
    parser.add_argument("--communication-taggings",
                        help="Print communication taggings",
                        action="store_true")
    parser.add_argument("--communication-taggings-tool",
                        type=str,
                        help='Filter --communication-taggings output to '
                        'specified tool (requires '
                        '--communication-taggings)')
    parser.add_argument("--mentions",
                        help="Print whitespace-separated tokens, with entity "
                        "mentions wrapped using <ENTITY ID=x> tags, where "
                        "'x' is the (zero-indexed) entity number",
                        action="store_true")
    parser.add_argument("--mentions-tool",
                        type=str,
                        help='Filter --mentions output to specified '
                        'tool (requires --mentions)')
    parser.add_argument("--ner",
                        help="Print first set of Named Entity Recognition "
                        "token tags in 'CoNLL-style' format",
                        action="store_true")
    parser.add_argument("--ner-tool",
                        type=str,
                        help='Filter --ner output to specified '
                        'tool (requires --ner)')
    parser.add_argument("--pos",
                        help="Print first set of Part-Of-Speech token tags in "
                        "'CoNLL-style' format",
                        action="store_true")
    parser.add_argument("--pos-tool",
                        type=str,
                        help='Filter --pos output to specified '
                        'tool (requires --pos)')
    parser.add_argument("--sections",
                        action='store_true',
                        help="Print text according to Section offsets "
                        "(textSpan values). These textSpans are assumed "
                        "to be valid.")
    parser.add_argument("--sections-tool",
                        type=str,
                        help='Filter --sections output to specified '
                        'tool (requires --sections)')
    parser.add_argument("--situation-mentions",
                        help="Print info about all SituationMentions",
                        action="store_true")
    parser.add_argument("--situation-mentions-tool",
                        type=str,
                        help='Filter --situation-mentions output to specified '
                        'tool (requires --situation-mentions)')
    parser.add_argument("--situations",
                        help="Print info about all Situations and their "
                        "SituationMentions",
                        action="store_true")
    parser.add_argument("--situations-tool",
                        type=str,
                        help='Filter --situations output to specified '
                        'tool (requires --situations)')
    parser.add_argument("--text",
                        help="Print .text field",
                        action="store_true")
    parser.add_argument("--text-tool",
                        type=str,
                        help='Filter --text output to specified '
                        'tool (requires --text)')
    parser.add_argument("--tokens",
                        help="Print whitespace-separated tokens for *all* "
                        "Tokenizations in a Communication.  Each sentence "
                        "tokenization is printed on a separate line, and "
                        "empty lines indicate a section break",
                        action="store_true")
    parser.add_argument("--tokens-tool",
                        type=str,
                        help='Filter --tokens output to specified '
                        'tool (requires --tokens)')
    parser.add_argument("--treebank",
                        help="Print Penn-Treebank style parse trees for *all* "
                        "Constituent Parses in the Communication",
                        action="store_true")
    parser.add_argument("--treebank-tool",
                        type=str,
                        help='Filter --treebank output to specified '
                        'tool (requires --treebank)')
    parser.add_argument("--id",
                        help='Print communication id',
                        action='store_true')
    parser.add_argument("--id-tool",
                        type=str,
                        help='Filter --id output to specified '
                        'tool (requires --id)')
    parser.add_argument("--no-references",
                        help="Don't add references to communication (may "
                        "prevent 'NoneType' errors)",
                        action="store_true")
    parser.add_argument('communication_filename',
                        nargs='?',
                        type=str,
                        help='Path to a Concrete Communication from which '
                        'to display information. If not specified, read '
                        'from standard input')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    add_references = not args.no_references

    if args.communication_filename is not None:
        comms = CommunicationReader(args.communication_filename,
                                    add_references=add_references)
    else:
        # Read from stdin via the POSIX fd device (won't work on Windows)
        comms = CommunicationReader('/dev/fd/0',
                                    add_references=add_references,
                                    filetype=FileType.STREAM)

    # At least one view flag is required; otherwise show usage and exit
    if not (args.char_offsets or args.dependency or args.lemmas or args.ner
            or args.pos or args.entities or args.mentions or args.metadata
            or args.sections or args.situation_mentions or args.situations
            or args.text or args.tokens or args.treebank or args.id
            or args.communication_taggings):
        parser.print_help()
        sys.exit(1)

    # Each --*-tool filter is only meaningful alongside its view flag
    if ((args.dependency_tool and not args.dependency)
            or (args.lemmas_tool and not args.lemmas) or
        (args.ner_tool and not args.ner) or (args.pos_tool and not args.pos)
            or (args.entities_tool and not args.entities)
            or (args.mentions_tool and not args.mentions)
            or (args.metadata_tool and not args.metadata)
            or (args.sections_tool and not args.sections)
            or (args.situation_mentions_tool and not args.situation_mentions)
            or (args.situations_tool and not args.situations)
            or (args.text_tool and not args.text)
            or (args.tokens_tool and not args.tokens)
            or (args.treebank_tool and not args.treebank)
            or (args.id_tool and not args.id)
            or (args.communication_taggings_tool
                and not args.communication_taggings)):
        parser.print_help()
        sys.exit(1)

    comm_num = 0

    for (comm, _) in comms:
        # --count limits how many communications are printed
        if args.count is not None and comm_num == args.count:
            break

        if args.id:
            print_header_if('id', args.annotation_headers)
            concrete.inspect.print_id_for_communication(comm,
                                                        tool=args.id_tool)
        if args.text:
            print_header_if('text', args.annotation_headers)
            concrete.inspect.print_text_for_communication(comm,
                                                          tool=args.text_tool)
        if args.sections:
            print_header_if('sections', args.annotation_headers)
            concrete.inspect.print_sections(comm, tool=args.sections_tool)
        if args.tokens:
            print_header_if('tokens', args.annotation_headers)
            concrete.inspect.print_tokens_for_communication(
                comm, tool=args.tokens_tool)
        if args.treebank:
            print_header_if('treebank', args.annotation_headers)
            concrete.inspect.print_penn_treebank_for_communication(
                comm, tool=args.treebank_tool)
        # The CoNLL-style flags share a single printer call
        if (args.char_offsets or args.dependency or args.lemmas or args.ner
                or args.pos):
            print_header_if('conll', args.annotation_headers)
            concrete.inspect.print_conll_style_tags_for_communication(
                comm,
                char_offsets=args.char_offsets,
                dependency=args.dependency,
                lemmas=args.lemmas,
                ner=args.ner,
                pos=args.pos,
                dependency_tool=args.dependency_tool,
                lemmas_tool=args.lemmas_tool,
                pos_tool=args.pos_tool,
                ner_tool=args.ner_tool)
        if args.entities:
            print_header_if('entities', args.annotation_headers)
            concrete.inspect.print_entities(comm, tool=args.entities_tool)
        if args.mentions:
            print_header_if('mentions', args.annotation_headers)
            concrete.inspect.print_tokens_with_entityMentions(
                comm, tool=args.mentions_tool)
        if args.situations:
            print_header_if('situations', args.annotation_headers)
            concrete.inspect.print_situations(comm, tool=args.situations_tool)
        if args.situation_mentions:
            print_header_if('situation mentions', args.annotation_headers)
            concrete.inspect.print_situation_mentions(
                comm, tool=args.situation_mentions_tool)
        if args.communication_taggings:
            print_header_if('communication taggings', args.annotation_headers)
            concrete.inspect.print_communication_taggings_for_communication(
                comm, tool=args.communication_taggings_tool)
        if args.metadata:
            print_header_if('metadata', args.annotation_headers)
            concrete.inspect.print_metadata(comm, tool=args.metadata_tool)

        comm_num += 1