def test_create_comm_annotated(output_file, text):
    """create-comm.py with --annotation-level section produces sections."""
    proc = Popen(
        [
            sys.executable, 'scripts/create-comm.py',
            '--annotation-level', 'section',
            'tests/testdata/les-deux-chandeliers.txt', output_file
        ],
        stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    assert proc.returncode == 0

    comm_iter = iter(CommunicationReader(output_file))
    comm, _ = next(comm_iter)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert len(comm.sectionList) == 2
    # the archive must contain exactly one communication
    with raises(StopIteration):
        next(comm_iter)
def test_create_comm_stdout(output_file, text):
    """create-comm.py writes the communication to stdout when given '-'."""
    proc = Popen(
        [
            sys.executable, 'scripts/create-comm.py',
            'tests/testdata/les-deux-chandeliers.txt', '-'
        ],
        stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    assert proc.returncode == 0

    # round-trip captured stdout through a file so the reader can open it
    with open(output_file, 'wb') as f:
        f.write(stdout)

    comm_iter = iter(CommunicationReader(output_file))
    comm, _ = next(comm_iter)
    assert comm.id == 'tests/testdata/les-deux-chandeliers.txt'
    assert validate_communication(comm)
    assert comm.text == text
    assert comm.sectionList is None
    with raises(StopIteration):
        next(comm_iter)
def test_compress_uuids_api(reader_kwargs, compress_kwargs):
    """compress_uuids keeps communication ids and validity intact."""
    input_file = 'tests/testdata/simple.tar.gz'
    comm_iter = iter(CommunicationReader(input_file, **reader_kwargs))
    for expected_id in ('one', 'two', 'three'):
        comm, _ = next(comm_iter)
        new_comm, uc = compress_uuids(comm, **compress_kwargs)
        assert new_comm.id == expected_id
        assert comm.id == new_comm.id
        assert validate_communication(new_comm)
    # exactly three communications in the archive
    with raises(StopIteration):
        next(comm_iter)
def test_tar(fifo):
    """Read a tar archive streamed through a FIFO by a writer process."""
    input_path = 'tests/testdata/simple.tar'
    writer_proc = Process(target=write_fifo, args=(input_path, fifo))
    writer_proc.start()
    comm_iter = iter(CommunicationReader(fifo, filetype=FileType.TAR))
    for expected_id in ('one', 'two', 'three'):
        comm, path = next(comm_iter)
        assert comm.id == expected_id
    with raises(StopIteration):
        next(comm_iter)
    writer_proc.join()
def test_create_comm_tarball(output_file, text_l0, text_l1):
    """create-comm-tarball.py converts a tgz of text files to communications."""
    proc = Popen(
        [
            sys.executable, 'scripts/create-comm-tarball.py',
            'tests/testdata/les-deux-chandeliers.tar.gz', output_file
        ],
        stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    assert proc.returncode == 0

    comm_iter = iter(CommunicationReader(output_file))
    for expected_id, expected_text in (
            ('les-deux-chandeliers/l0.txt', text_l0),
            ('les-deux-chandeliers/l1.txt', text_l1)):
        comm, _ = next(comm_iter)
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        assert comm.sectionList is None
    with raises(StopIteration):
        next(comm_iter)
def test_CommunicationReader_single_gz_file_no_add_references():
    """With add_references=False no backpointer attributes are attached."""
    filename = u'tests/testdata/simple_1.concrete.gz'
    comm, comm_filename = next(
        CommunicationReader(filename, add_references=False))
    assert not hasattr(comm, 'sentenceForUUID')
    assert comm.id == u'one'
    assert comm_filename == filename
def test_CommunicationReader_explicit_single_gz_file():
    """An explicit STREAM_GZ filetype reads a single .gz communication."""
    filename = u'tests/testdata/simple_1.concrete.gz'
    comm, comm_filename = next(
        CommunicationReader(filename, filetype=FileType.STREAM_GZ))
    assert hasattr(comm, 'sentenceForUUID')
    assert comm.id == u'one'
    assert comm_filename == filename
def test_CommunicationReader_tar_gz_file_unicode():
    """Read a tar.gz whose communications contain unicode text."""
    reader = CommunicationReader(
        "tests/testdata/les-deux-chandeliers.concrete.tar.gz")
    comms = [comm for (comm, _) in reader]
    assert len(comms) == 2
    assert comms[0].id == 'les-deux-chandeliers/l0.txt'
    assert comms[1].id == 'les-deux-chandeliers/l1.txt'
def test_tweets2concrete_log_config(log_conf, output_file):
    """--log-conf-path routes INFO messages to the configured log file."""
    log_conf_path, log_path = log_conf
    proc = Popen(
        [
            sys.executable, 'scripts/tweets2concrete.py',
            '--log-conf-path', log_conf_path,
            '--log-interval', '1',
            'tests/testdata/tweets.json', output_file
        ],
        stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    # nothing should reach the console when a log file is configured
    assert stdout.decode('utf-8') == ''
    assert stderr.decode('utf-8') == ''
    assert proc.returncode == 0

    with open(log_path) as f:
        data = f.read()
    info_lines = [line for line in data.strip().split('\n') if 'INFO' in line]
    assert len(info_lines) >= 2

    comm_iter = iter(CommunicationReader(output_file))
    comm, _ = next(comm_iter)
    assert_first_comm(comm)
    comm, _ = next(comm_iter)
    assert_second_comm(comm)
    with raises(StopIteration):
        next(comm_iter)
def test_tweets2concrete(output_file):
    """tweets2concrete.py converts a JSON tweet file into communications."""
    proc = Popen(
        [
            sys.executable, 'scripts/tweets2concrete.py',
            'tests/testdata/tweets.json', output_file
        ],
        stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    assert proc.returncode == 0

    comm_iter = iter(CommunicationReader(output_file))
    comm, _ = next(comm_iter)
    assert_first_comm(comm)
    comm, _ = next(comm_iter)
    assert_second_comm(comm)
    with raises(StopIteration):
        next(comm_iter)
def test_tweets2concrete_log_every(output_file):
    """--log-interval 1 emits at least one INFO line per tweet to stderr."""
    proc = Popen(
        [
            sys.executable, 'scripts/tweets2concrete.py',
            '--log-level', 'INFO', '--log-interval', '1',
            'tests/testdata/tweets.json', output_file
        ],
        stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    assert proc.returncode == 0
    info_lines = [
        line for line in stderr.decode('utf-8').strip().split('\n')
        if 'INFO' in line
    ]
    assert len(info_lines) >= 2

    comm_iter = iter(CommunicationReader(output_file))
    comm, _ = next(comm_iter)
    assert_first_comm(comm)
    comm, _ = next(comm_iter)
    assert_second_comm(comm)
    with raises(StopIteration):
        next(comm_iter)
def main():
    """Encode a Communication archive as a one-column CSV file.

    Each row of the output CSV holds one TJSONProtocol-encoded
    Communication under the column named by ``--column-name``.
    """
    parser = argparse.ArgumentParser(
        description=
        "Encode a Communication archive as a CSV file, where each row contains a "
        "TJSONProtocol encoded Communication",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'comms_archive',
        help="A directory, TGZ file or Zip file of Communications")
    parser.add_argument(
        'csv_file',
        help="Output CSV file with TJSONProtocol encoded Communications")
    parser.add_argument('--column-name', default='comm',
                        help="Name to use for CSV column header")
    args = parser.parse_args()

    # Fix: the original opened csv_fh without ever closing it, leaking the
    # file handle and risking unflushed output.  A with-block guarantees the
    # file is closed even if serialization raises.
    with open(args.csv_file, 'wb') as csv_fh:
        fieldnames = [args.column_name]
        writer = unicodecsv.DictWriter(csv_fh, fieldnames,
                                       lineterminator='\n',
                                       quoting=unicodecsv.QUOTE_ALL)
        writer.writeheader()
        for (comm, filename) in CommunicationReader(args.comms_archive):
            json_communication = TSerialization.serialize(
                comm, TJSONProtocol.TJSONProtocolFactory()).decode('utf-8')
            writer.writerow({args.column_name: json_communication})
def test_CommunicationReader_single_bz2_file():
    """Read a single bz2-compressed communication."""
    filename = u'tests/testdata/simple_1.concrete.bz2'
    comm, comm_filename = next(CommunicationReader(filename))
    assert hasattr(comm, 'sentenceForUUID')
    assert comm.id == u'one'
    assert comm_filename == filename
def main():
    """Annotate communications from --input via an Annotator service and
    write the results to --output ('-' means stdin/stdout)."""
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description="Interface with a Concrete Annotator service")
    parser.add_argument('host',
                        help="Hostname of annotator service to which to"
                             " connect.")
    parser.add_argument('port', type=int,
                        help="Port of annotator service to which to connect.")
    parser.add_argument('--input', default='-',
                        help="Input source to use. '-' for stdin; otherwise"
                             " takes a path to a file.")
    parser.add_argument('--output', default='-',
                        help="Output source to use. '-' for stdout; otherwise"
                             " takes a path to a file.")
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    # Won't work on Windows... but that use case is very unlikely
    in_path = args.input if args.input != '-' else '/dev/fd/0'
    out_path = args.output if args.output != '-' else '/dev/fd/1'

    comm_pairs = CommunicationReader(in_path)
    with AnnotatorClientWrapper(args.host, args.port) as client:
        with CommunicationWriter(out_path) as writer:
            for comm, _ in comm_pairs:
                writer.write(client.annotate(comm))
def test_concatenated_bz2(fifo):
    '''
    Note: concatenated_gz does not work, complaining about a tell (seek).
    tar_gz does work because the r|gz mode in tarfile results in direct
    calls to zlib for decompression. gzip (which wraps zlib and is used
    in CommunicationReader for non-tar gz files) is the culprit.
    '''
    input_path = 'tests/testdata/simple_concatenated.bz2'
    writer_proc = Process(target=write_fifo, args=(input_path, fifo))
    writer_proc.start()
    comm_iter = iter(CommunicationReader(fifo, filetype=FileType.STREAM_BZ2))
    for expected_id in ('one', 'two', 'three'):
        comm, path = next(comm_iter)
        assert comm.id == expected_id
    with raises(StopIteration):
        next(comm_iter)
    writer_proc.join()
def test_CommunicationReader_explicit_concatenated_bz2_file():
    """Explicit STREAM_BZ2 filetype reads concatenated bz2 communications."""
    filename = u'tests/testdata/simple_concatenated.bz2'
    results = list(CommunicationReader(filename,
                                       filetype=FileType.STREAM_BZ2))
    for i, expected_id in enumerate([u'one', u'two', u'three']):
        comm, comm_filename = results[i]
        assert hasattr(comm, 'sentenceForUUID')
        assert comm.id == expected_id
        assert comm_filename == filename
def test_CommunicationReader_concatenated_gz_file():
    """Concatenated gz streams yield all contained communications."""
    filename = u'tests/testdata/simple_concatenated.gz'
    results = list(CommunicationReader(filename))
    for i, expected_id in enumerate([u'one', u'two', u'three']):
        comm, comm_filename = results[i]
        assert hasattr(comm, 'sentenceForUUID')
        assert comm.id == expected_id
        assert comm_filename == filename
def test_CommunicationReader_zip_file():
    """Read three communications from a zip archive."""
    results = list(CommunicationReader("tests/testdata/simple.zip"))
    expected = [(u'one', u'simple_1.concrete'),
                (u'two', u'simple_2.concrete'),
                (u'three', u'simple_3.concrete')]
    for i, (expected_id, expected_name) in enumerate(expected):
        comm, comm_filename = results[i]
        assert hasattr(comm, 'sentenceForUUID')
        assert comm.id == expected_id
        assert comm_filename == expected_name
def main():
    """Store every communication from an input file into an S3 bucket,
    keyed by communication id."""
    set_stdout_encoding()

    parser = argparse.ArgumentParser(
        description='Read communications from file and write to an AWS S3 '
                    'bucket (keyed by communication id).',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('input_path',
                        help='path to input communications (uncompressed, '
                             'gz, bz2, tar, zip, etc.) (if "-", read from '
                             'stdin)')
    parser.add_argument('bucket_name', help='name of S3 bucket to write to')
    parser.add_argument('--prefix-len', type=int,
                        default=DEFAULT_S3_KEY_PREFIX_LEN,
                        help='S3 keys are prefixed with hashes of this length')
    parser.add_argument('-l', '--loglevel', '--log-level',
                        help='Logging verbosity level threshold (to stderr)',
                        default='info')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    # '-' means read a communication stream from stdin
    if args.input_path == '-':
        comm_pairs = CommunicationReader('/dev/fd/0',
                                         filetype=FileType.STREAM)
    else:
        comm_pairs = CommunicationReader(args.input_path)

    logging.info('connecting to s3')
    conn = connect_s3()
    logging.info('retrieving bucket {}'.format(args.bucket_name))
    bucket = conn.get_bucket(args.bucket_name)

    logging.info('reading from {}; writing to s3 bucket {}, prefix length {}'.format(
        args.input_path, args.bucket_name, args.prefix_len))
    handler = S3BackedStoreHandler(bucket, args.prefix_len)
    for comm, _ in comm_pairs:
        logging.info('storing {}'.format(comm.id))
        handler.store(comm)
def test_CommunicationReader_tar_gz_file_no_add_references():
    """No backpointers are attached when add_references=False on a tar.gz."""
    results = list(CommunicationReader("tests/testdata/simple.tar.gz",
                                       add_references=False))
    expected = [(u'one', u'simple_1.concrete'),
                (u'two', u'simple_2.concrete'),
                (u'three', u'simple_3.concrete')]
    for i, (expected_id, expected_name) in enumerate(expected):
        comm, comm_filename = results[i]
        assert not hasattr(comm, 'sentenceForUUID')
        assert comm.id == expected_id
        assert comm_filename == expected_name
def test_CommunicationReader_explicit_nested_tar_file():
    """Nested directories inside a tar are traversed; paths are preserved."""
    results = list(CommunicationReader("tests/testdata/simple_nested.tar",
                                       filetype=FileType.TAR))
    expected = [(u'one', u'a/b/simple_1.concrete'),
                (u'two', u'a/c/simple_2.concrete'),
                (u'three', u'a/c/simple_3.concrete')]
    for i, (expected_id, expected_name) in enumerate(expected):
        comm, comm_filename = results[i]
        assert hasattr(comm, 'sentenceForUUID')
        assert comm.id == expected_id
        assert comm_filename == expected_name
def main():
    """Scan every tokenization in a Communication file for empty
    dependency parses and log what is found."""
    parser = argparse.ArgumentParser(
        description="Inspect empty dependency parses")
    parser.add_argument('communication_file')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    logging.basicConfig(format='%(levelname)7s: %(message)s',
                        level=logging.INFO)

    for (comm, filename) in CommunicationReader(args.communication_file):
        # Fix: the original message had an unbalanced quote ("...ID '%s").
        # Also pass the id as a lazy logging argument instead of %-formatting
        # the string eagerly.
        logging.info(u"Inspecting Communication with ID '%s'", comm.id)
        for tokenization in concrete.inspect.get_tokenizations(comm):
            inspect_dependency_parses(tokenization)
def test_compress_uuids(output_file, args):
    """compress-uuids.py rewrites the archive and shrinks it on disk."""
    input_file = 'tests/testdata/simple.tar.gz'
    proc = Popen(
        [sys.executable, 'scripts/compress-uuids.py', input_file, output_file
         ] + list(args),
        stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    assert proc.returncode == 0

    comm_iter = iter(CommunicationReader(output_file))
    for expected_name, expected_id in (
            ('simple_1.concrete', 'one'),
            ('simple_2.concrete', 'two'),
            ('simple_3.concrete', 'three')):
        comm, comm_filename = next(comm_iter)
        assert comm_filename == expected_name
        assert comm.id == expected_id
        assert validate_communication(comm)
    # compressed UUIDs should yield a smaller archive than the input
    assert os.stat(output_file).st_size < os.stat(input_file).st_size
    with raises(StopIteration):
        next(comm_iter)
def test_create_comm_tarball_log_every(output_file, text_l0, text_l1):
    """--log-interval 1 logs at least one INFO line per input file."""
    proc = Popen(
        [
            sys.executable, 'scripts/create-comm-tarball.py',
            '--log-level', 'INFO', '--log-interval', '1',
            'tests/testdata/les-deux-chandeliers.tar.gz', output_file
        ],
        stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    assert proc.returncode == 0
    info_lines = [
        line for line in stderr.decode('utf-8').strip().split('\n')
        if 'INFO' in line
    ]
    assert len(info_lines) >= 2

    comm_iter = iter(CommunicationReader(output_file))
    for expected_id, expected_text in (
            ('les-deux-chandeliers/l0.txt', text_l0),
            ('les-deux-chandeliers/l1.txt', text_l1)):
        comm, _ = next(comm_iter)
        assert comm.id == expected_id
        assert validate_communication(comm)
        assert comm.text == expected_text
        assert comm.sectionList is None
    with raises(StopIteration):
        next(comm_iter)
def test_CommunicationReader_truncated_tgz_file():
    """A truncated tar.gz yields the intact communication, then EOFError.

    Fix: ``reader.next()`` is the Python 2 iterator protocol; on Python 3
    iterators expose ``__next__`` and must be advanced with the ``next()``
    builtin, so the original raised AttributeError before testing anything.
    """
    reader = CommunicationReader(
        'tests/testdata/simple_1_and_truncated.tar.gz')
    (simple_comm, _) = next(reader)
    with raises(EOFError):
        (truncated_comm, _) = next(reader)
def preprocess(tar_path, output_path):
    '''
    Extract (headline, first-paragraph) pairs from a Gigaword-style tar file.

    tar_path -- tar file to process
    output_path -- directory of the output file

    Each line of the output file is a JSON object of the form
    {'Headline': string, 'Text': string}.  Duplicate headlines and
    headlines shorter than three tokens are skipped.

    Returns the name (not full path) of the file written into output_path.
    '''
    fname = "%s.txt" % tar_path.split('/')[-1].split('.')[0]
    out_fname = os.path.join(output_path, fname)
    # headline -> first paragraph; used only to drop duplicate headlines
    mem = {}
    with open(out_fname, 'w') as f:
        for (comm, filename) in CommunicationReader(tar_path):
            text = comm.text
            # Pull the raw headline and first <P> paragraph out of the
            # SGML-ish markup.  (Assumes the tags are present — TODO confirm
            # behavior on documents missing <HEADLINE>/<P>.)
            headline_start = text.find("<HEADLINE>")
            headline_end = text.find('</HEADLINE>', headline_start)
            par1_start = text.find("<P>", headline_end)
            par1_end = text.find("</P>", par1_start)
            headline = text[headline_start +
                            len('<HEADLINE>'):headline_end].strip()
            par1 = text[par1_start + len("<P>"):par1_end].strip()

            # Fix (idiom): membership test on the dict itself instead of the
            # unidiomatic ``headline in mem.keys()``.
            if headline in mem:
                continue
            mem[headline] = par1

            if comm.id.startswith("XIN"):
                # For Xinhua headlines, remove anything before ':' or
                # anything after ':'.  Example headlines that need this:
                #   Roundup: Gulf Arab markets end on a mixed note
                #   Israelis more distrustful of gov't institutions: survey
                a = headline.find(":")
                if a != -1:
                    b = headline.rfind(":")
                    if a == b:
                        # Single colon: keep the longer side.
                        if a < len(headline) / 2:
                            headline = headline[a + 1:]
                        else:
                            headline = headline[:b]
                    else:
                        # Multiple colons: keep the middle portion.
                        headline = headline[a + 1:b]

            # Tokenize, strip punctuation, lowercase.
            headline_token = [
                t.strip(string.punctuation).lower()
                for t in word_tokenize(headline)
            ]
            # Ignore headlines that are too short.
            if len(headline_token) < 3:
                continue

            # Same normalization for the first paragraph.
            par1_token = [
                t.strip(string.punctuation).lower()
                for t in word_tokenize(par1)
            ]

            # Fix (idiom): join the lists directly rather than via a
            # redundant ``[t for t in ...]`` comprehension.
            headline = " ".join(headline_token)
            par1 = " ".join(par1_token)

            obj = {'Headline': headline, "Text": par1}
            f.write(json.dumps(obj) + '\n')
    print("completed file %s" % fname)
    return fname
def test_CommunicationReader_truncated_tgz_file():
    """A truncated tar.gz yields the intact communication, then EOFError.

    NOTE(review): this duplicates an identically-named test defined earlier
    in the file; Python keeps only the last definition — confirm which copy
    is intended.

    Fix: ``reader.next()`` is Python 2 only; Python 3 iterators must be
    advanced with the ``next()`` builtin.
    """
    reader = CommunicationReader(
        'tests/testdata/simple_1_and_truncated.tar.gz')
    (simple_comm, _) = next(reader)
    with raises(EOFError):
        (truncated_comm, _) = next(reader)
def test_CommunicationReader_truncated_gz_file():
    """Iterating a truncated gz file raises EOFError."""
    reader = CommunicationReader('tests/testdata/truncated.comm.gz')
    with raises(EOFError):
        # exhausting the reader hits the truncated tail
        for _pair in reader:
            pass
def main():
    """Annotate communications via an AnnotateCommunicationService server.

    Reads communications from --input ('-' = stdin), sends each to the
    service, and writes the annotated result to --output ('-' = stdout).
    Two transports are supported: THttp/TJSONProtocol when --uri is given,
    otherwise TSocket/TCompactProtocol via --host/--port.
    """
    set_stdout_encoding()
    parser = ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatter,
        description=
        "Interface with a Concrete AnnotateCommunicationService server. "
        "Supports either THttp/TJSONProtocol (using the '--uri' flag) "
        "or TSocket/TCompactProtocol (using '--host'/'--port')")
    parser.add_argument(
        '--host',
        default='localhost',
        help="Hostname of TSocket/TCompactProtocol AnnotateCommunicationService"
    )
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=9090,
        help="Port of TSocket/TCompactProtocol AnnotateCommunicationService")
    parser.add_argument(
        '--uri',
        '--url',
        help="URI of THttpServer/TJSONProtocol AnnotateCommunicationService")
    parser.add_argument('-l',
                        '--loglevel',
                        '--log-level',
                        help='Logging verbosity level threshold (to stderr)',
                        default='info')
    parser.add_argument('--input',
                        default='-',
                        help="Input source to use. '-' for stdin; otherwise"
                        " takes a path to a file.")
    parser.add_argument('--output',
                        default='-',
                        help="Output source to use. '-' for stdout; otherwise"
                        " takes a path to a file.")
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()
    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())
    # Won't work on Windows: '-' is mapped to the /dev/fd file descriptors.
    if args.input == '-':
        reader_kwargs = dict(filetype=FileType.STREAM)
        input_path = '/dev/fd/0'
    else:
        reader_kwargs = dict()
        input_path = args.input
    output_path = '/dev/fd/1' if args.output == '-' else args.output
    reader = CommunicationReader(input_path, **reader_kwargs)
    if args.uri:
        # HTTP transport: a TProtocolException here means the URI answered
        # HTTP but is not a THttp/TJSONProtocol annotate endpoint.
        try:
            with HTTPAnnotateCommunicationClientWrapper(args.uri) as client:
                with CommunicationWriter(output_path) as writer:
                    for (comm, _) in reader:
                        writer.write(client.annotate(comm))
        except TProtocolException as ex:
            logging.error(ex)
            logging.error(
                "Successfully connected to the URI '{}' using HTTP, but the URI does not "
                "appear to be an AnnotateCommunicationService endpoint that uses the "
                "Thrift THttp transport and TJSONProtocol encoding".format(
                    args.uri))
    else:
        # Socket transport; connection failures are silently ignored
        # (TTransportException -> pass), presumably because the wrapper
        # already reports them — TODO confirm.
        try:
            with AnnotateCommunicationClientWrapper(args.host,
                                                    args.port) as client:
                with CommunicationWriter(output_path) as writer:
                    for (comm, _) in reader:
                        writer.write(client.annotate(comm))
        except TTransportException:
            pass
def test_CommunicationReader_single_file_unicode():
    """Read a single uncompressed communication containing unicode text."""
    reader = CommunicationReader(
        "tests/testdata/les-deux-chandeliers.concrete")
    comms = [comm for (comm, _) in reader]
    assert len(comms) == 1
    assert comms[0].id == 'tests/testdata/les-deux-chandeliers.txt'
def main():
    """Print selected views of each Communication in a file (or stdin).

    Each ``--X`` flag enables one view (text, tokens, entities, ...) and the
    matching ``--X-tool`` flag restricts that view to annotations produced by
    a specific tool.  At least one view flag is required; a ``--X-tool`` flag
    without its ``--X`` flag is an error.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    parser = argparse.ArgumentParser(
        description="Print information about a Concrete Communication to"
        " stdout. If communication_filename is specified, read"
        " communication from file; otherwise, read from standard"
        " input.",
    )
    parser.add_argument('--count', type=int,
                        help='Print at most this many communications.')
    parser.add_argument('--annotation-headers',
                        action='store_true',
                        help='Print annotation type headers.')
    parser.add_argument("--char-offsets",
                        help="Print token text extracted from character offset"
                        "s (not the text stored in the tokenization) in '"
                        "ConLL-style' format",
                        action="store_true")
    parser.add_argument("--dependency",
                        help="Print HEAD and DEPREL tags for first dependency "
                        "parse in 'ConLL-style' format",
                        action="store_true")
    parser.add_argument("--dependency-tool", type=str,
                        help='Filter --dependency output to specified '
                        'tool (requires --dependency)')
    parser.add_argument("--entities",
                        help="Print info about all Entities and their EntityMe"
                        "ntions",
                        action="store_true")
    parser.add_argument("--entities-tool", type=str,
                        help='Filter --entities output to specified '
                        'tool (requires --entities)')
    parser.add_argument("--lemmas",
                        help="Print first set of lemma token tags in 'ConLL-st"
                        "yle' format",
                        action="store_true")
    parser.add_argument("--lemmas-tool", type=str,
                        help='Filter --lemmas output to specified '
                        'tool (requires --lemmas)')
    parser.add_argument("--metadata",
                        help="Print metadata for tools used to annotate Commun"
                        "ication",
                        action="store_true")
    parser.add_argument("--metadata-tool", type=str,
                        help='Filter --metadata output to specified '
                        'tool (requires --metadata)')
    parser.add_argument("--communication-taggings",
                        help="Print communication taggings",
                        action="store_true")
    parser.add_argument("--communication-taggings-tool", type=str,
                        help='Filter --communication-taggings output to '
                        'specified tool (requires '
                        '--communication-taggings)')
    parser.add_argument("--mentions",
                        help="Print whitespace-separated tokens, with entity m"
                        "entions wrapped using <ENTITY ID=x> tags, where "
                        "'x' is the (zero-indexed) entity number",
                        action="store_true")
    parser.add_argument("--mentions-tool", type=str,
                        help='Filter --mentions output to specified '
                        'tool (requires --mentions)')
    parser.add_argument("--ner",
                        help="Print first set of Named Entity Recognition toke"
                        "n tags in 'ConLL-style' format",
                        action="store_true")
    parser.add_argument("--ner-tool", type=str,
                        help='Filter --ner output to specified '
                        'tool (requires --ner)')
    parser.add_argument("--pos",
                        help="Print first set of Part-Of-Speech token tags in "
                        "'ConLL-style' format",
                        action="store_true")
    parser.add_argument("--pos-tool", type=str,
                        help='Filter --pos output to specified '
                        'tool (requires --pos)')
    parser.add_argument("--sections",
                        action='store_true',
                        help="Print text according to Section offsets"
                        "(textSpan values). These textSpans are assumed "
                        "to be valid.")
    parser.add_argument("--sections-tool", type=str,
                        help='Filter --sections output to specified '
                        'tool (requires --sections)')
    parser.add_argument("--situation-mentions",
                        help="Print info about all SituationMentions",
                        action="store_true")
    parser.add_argument("--situation-mentions-tool", type=str,
                        help='Filter --situation-mentions output to specified '
                        'tool (requires --situation-mentions)')
    parser.add_argument("--situations",
                        help="Print info about all Situations and their Situat"
                        "ionMentions",
                        action="store_true")
    parser.add_argument("--situations-tool", type=str,
                        help='Filter --situations output to specified '
                        'tool (requires --situations)')
    parser.add_argument("--text",
                        help="Print .text field",
                        action="store_true")
    parser.add_argument("--text-tool", type=str,
                        help='Filter --text output to specified '
                        'tool (requires --text)')
    parser.add_argument("--tokens",
                        help="Print whitespace-seperated tokens for *all* Toke"
                        "nizations in a Communication. Each sentence tok"
                        "enization is printed on a separate line, and "
                        "empty lines indicate a section break",
                        action="store_true")
    parser.add_argument("--tokens-tool", type=str,
                        help='Filter --tokens output to specified '
                        'tool (requires --tokens)')
    parser.add_argument("--treebank",
                        help="Print Penn-Treebank style parse trees for *all* "
                        "Constituent Parses in the Communication",
                        action="store_true")
    parser.add_argument("--treebank-tool", type=str,
                        help='Filter --treebank output to specified '
                        'tool (requires --treebank)')
    parser.add_argument("--id",
                        help='Print communication id',
                        action='store_true')
    parser.add_argument("--id-tool", type=str,
                        help='Filter --id output to specified '
                        'tool (requires --id)')
    parser.add_argument("--no-references",
                        help="Don't add references to communication (may preve"
                        "nt 'NoneType' errors)",
                        action="store_true")
    parser.add_argument('communication_filename',
                        nargs='?',
                        type=str,
                        help='Path to a Concrete Communication from which '
                        'to display information. If not specified, read '
                        'read from standard input')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    add_references = not args.no_references

    # No filename -> read a communication stream from stdin.
    if args.communication_filename is not None:
        comms = CommunicationReader(args.communication_filename,
                                    add_references=add_references)
    else:
        comms = CommunicationReader('/dev/fd/0',
                                    add_references=add_references,
                                    filetype=FileType.STREAM)

    # At least one view flag must be set, otherwise show usage and exit.
    if not (args.char_offsets or args.dependency or args.lemmas or args.ner
            or args.pos or args.entities or args.mentions or args.metadata
            or args.sections or args.situation_mentions or args.situations
            or args.text or args.tokens or args.treebank or args.id
            or args.communication_taggings):
        parser.print_help()
        sys.exit(1)

    # Each --X-tool flag requires its corresponding --X flag.
    if ((args.dependency_tool and not args.dependency)
            or (args.lemmas_tool and not args.lemmas)
            or (args.ner_tool and not args.ner)
            or (args.pos_tool and not args.pos)
            or (args.entities_tool and not args.entities)
            or (args.mentions_tool and not args.mentions)
            or (args.metadata_tool and not args.metadata)
            or (args.sections_tool and not args.sections)
            or (args.situation_mentions_tool and not args.situation_mentions)
            or (args.situations_tool and not args.situations)
            or (args.text_tool and not args.text)
            or (args.tokens_tool and not args.tokens)
            or (args.treebank_tool and not args.treebank)
            or (args.id_tool and not args.id)
            or (args.communication_taggings_tool
                and not args.communication_taggings)):
        parser.print_help()
        sys.exit(1)

    # Print the requested views for each communication, stopping after
    # --count communications if that limit was given.
    comm_num = 0
    for (comm, _) in comms:
        if args.count is not None and comm_num == args.count:
            break
        if args.id:
            print_header_if('id', args.annotation_headers)
            concrete.inspect.print_id_for_communication(comm,
                                                        tool=args.id_tool)
        if args.text:
            print_header_if('text', args.annotation_headers)
            concrete.inspect.print_text_for_communication(comm,
                                                          tool=args.text_tool)
        if args.sections:
            print_header_if('sections', args.annotation_headers)
            concrete.inspect.print_sections(comm, tool=args.sections_tool)
        if args.tokens:
            print_header_if('tokens', args.annotation_headers)
            concrete.inspect.print_tokens_for_communication(
                comm, tool=args.tokens_tool)
        if args.treebank:
            print_header_if('treebank', args.annotation_headers)
            concrete.inspect.print_penn_treebank_for_communication(
                comm, tool=args.treebank_tool)
        # The five ConLL-style views are printed together in one table.
        if (args.char_offsets or args.dependency or args.lemmas or args.ner
                or args.pos):
            print_header_if('conll', args.annotation_headers)
            concrete.inspect.print_conll_style_tags_for_communication(
                comm,
                char_offsets=args.char_offsets,
                dependency=args.dependency,
                lemmas=args.lemmas,
                ner=args.ner,
                pos=args.pos,
                dependency_tool=args.dependency_tool,
                lemmas_tool=args.lemmas_tool,
                pos_tool=args.pos_tool,
                ner_tool=args.ner_tool)
        if args.entities:
            print_header_if('entities', args.annotation_headers)
            concrete.inspect.print_entities(comm, tool=args.entities_tool)
        if args.mentions:
            print_header_if('mentions', args.annotation_headers)
            concrete.inspect.print_tokens_with_entityMentions(
                comm, tool=args.mentions_tool)
        if args.situations:
            print_header_if('situations', args.annotation_headers)
            concrete.inspect.print_situations(comm, tool=args.situations_tool)
        if args.situation_mentions:
            print_header_if('situation mentions', args.annotation_headers)
            concrete.inspect.print_situation_mentions(
                comm, tool=args.situation_mentions_tool)
        if args.communication_taggings:
            print_header_if('communication taggings', args.annotation_headers)
            concrete.inspect.print_communication_taggings_for_communication(
                comm, tool=args.communication_taggings_tool)
        if args.metadata:
            print_header_if('metadata', args.annotation_headers)
            concrete.inspect.print_metadata(comm, tool=args.metadata_tool)
        comm_num += 1