コード例 #1
0
def test_CommunicationWriterTar_single_file_fixed_point_unicode(
        output_file, login_info):
    """Round-trip a Communication with a unicode filename through
    CommunicationWriterTar and verify the single archived member's
    bytes are identical to the input file.
    """
    comm = read_communication_from_file(
        "tests/testdata/les-deux-chandeliers.concrete")
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "les-deux-chandeliers.concrete")

    assert tarfile.is_tarfile(output_file)

    # Context manager guarantees the archive handle is closed even when
    # an assertion below fails (the original leaked it on failure).
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert "les-deux-chandeliers.concrete" == tarinfo.name
        actual_data = f.extractfile(tarinfo).read()
        with open('tests/testdata/les-deux-chandeliers.concrete',
                  'rb') as expected_f:
            expected_data = expected_f.read()
            assert expected_data == actual_data

        # Exactly one member expected: the next entry must be None.
        tarinfo = f.next()
        assert tarinfo is None
コード例 #2
0
def test_CommunicationWriterTar_single_file_ctx_mgr(output_file, login_info):
    """Write one Communication via the CommunicationWriterTar context
    manager and verify the tar member's name, type, timestamp, size,
    permissions, and ownership metadata.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert tarfile.is_tarfile(output_file)

    # Close the archive even if an assertion fails (original leaked it).
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert "simple_1.concrete" == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time() - TIME_MARGIN
        assert os.stat(
            'tests/testdata/simple_1.concrete').st_size == tarinfo.size
        assert 0o644 == tarinfo.mode
        assert login_info['uid'] == tarinfo.uid
        assert login_info['username'] == tarinfo.uname
        assert login_info['gid'] == tarinfo.gid
        assert login_info['groupname'] == tarinfo.gname

        # The archive must contain exactly one member.
        tarinfo = f.next()
        assert tarinfo is None
コード例 #3
0
def test_CommunicationWriterTGZ_single_file_default_name(output_file,
                                                         login_info):
    """Write one Communication with CommunicationWriterTGZ using the
    default member name (comm.id + '.concrete') and verify the member's
    metadata.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterTGZ()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert tarfile.is_tarfile(output_file)

    # Close the archive even if an assertion fails (original leaked it).
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert comm.id + '.concrete' == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time() - TIME_MARGIN
        assert os.stat(
            'tests/testdata/simple_1.concrete').st_size == tarinfo.size
        assert 0o644 == tarinfo.mode
        assert login_info['uid'] == tarinfo.uid
        assert login_info['username'] == tarinfo.uname
        assert login_info['gid'] == tarinfo.gid
        assert login_info['groupname'] == tarinfo.gname

        # The archive must contain exactly one member.
        tarinfo = f.next()
        assert tarinfo is None
コード例 #4
0
def test_CommunicationWriterTGZ_single_file_default_name(
        output_file, login_info):
    """Write one Communication with CommunicationWriterTGZ using the
    default member name (comm.id + '.concrete') and verify the member's
    metadata.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterTGZ()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert tarfile.is_tarfile(output_file)

    # Close the archive even if an assertion fails (original leaked it).
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert comm.id + '.concrete' == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time() - TIME_MARGIN
        assert os.stat(
            'tests/testdata/simple_1.concrete').st_size == tarinfo.size
        assert 0o644 == tarinfo.mode
        assert login_info['uid'] == tarinfo.uid
        assert login_info['username'] == tarinfo.uname
        assert login_info['gid'] == tarinfo.gid
        assert login_info['groupname'] == tarinfo.gname

        # The archive must contain exactly one member.
        tarinfo = f.next()
        assert tarinfo is None
コード例 #5
0
def test_CommunicationWriterTar_single_file_fixed_point_unicode(output_file,
                                                                login_info):
    """Round-trip a Communication with a unicode filename through
    CommunicationWriterTar and verify the single archived member's
    bytes are identical to the input file.
    """
    comm = read_communication_from_file(
        "tests/testdata/les-deux-chandeliers.concrete"
    )
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "les-deux-chandeliers.concrete")

    assert tarfile.is_tarfile(output_file)

    # Context manager guarantees the archive handle is closed even when
    # an assertion below fails (the original leaked it on failure).
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert "les-deux-chandeliers.concrete" == tarinfo.name
        actual_data = f.extractfile(tarinfo).read()
        with open('tests/testdata/les-deux-chandeliers.concrete',
                  'rb') as expected_f:
            expected_data = expected_f.read()
            assert expected_data == actual_data

        # Exactly one member expected: the next entry must be None.
        tarinfo = f.next()
        assert tarinfo is None
コード例 #6
0
def test_CommunicationWriterTar_single_file_ctx_mgr(output_file, login_info):
    """Write one Communication via the CommunicationWriterTar context
    manager and verify the tar member's name, type, timestamp, size,
    permissions, and ownership metadata.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterTar(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert tarfile.is_tarfile(output_file)

    # Close the archive even if an assertion fails (original leaked it).
    with tarfile.open(output_file) as f:
        tarinfo = f.next()
        assert tarinfo is not None

        assert "simple_1.concrete" == tarinfo.name
        assert tarinfo.isreg()
        assert tarinfo.mtime > time() - TIME_MARGIN
        assert os.stat(
            'tests/testdata/simple_1.concrete').st_size == tarinfo.size
        assert 0o644 == tarinfo.mode
        assert login_info['uid'] == tarinfo.uid
        assert login_info['username'] == tarinfo.uname
        assert login_info['gid'] == tarinfo.gid
        assert login_info['groupname'] == tarinfo.gname

        # The archive must contain exactly one member.
        tarinfo = f.next()
        assert tarinfo is None
コード例 #7
0
def main():
    """Convert a Concrete Communication file to a JSON file.

    Expects two positional arguments: <input path> <output path>.
    Prints help and exits with status 1 when they are missing.
    """
    usage = "%prog [options] <input path> <output path>"
    parser = optparse.OptionParser(usage=usage)
    (options, args) = parser.parse_args(sys.argv)

    # args[0] is the program name, so two positional arguments -> len == 3.
    if len(args) != 3:
        parser.print_help()
        sys.exit(1)

    in_path = args[1]
    out_path = args[2]

    if not os.path.exists(in_path):
        raise Exception("Input path doesn't exist: " + in_path)

    comm = read_communication_from_file(in_path)
    js = comm2json(comm)

    # Open in text mode with an explicit encoding: json.dump() writes str,
    # and its Python-2-only `encoding` keyword was removed in Python 3, so
    # the former 'wb' + encoding="utf-8" combination no longer works.
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(js, out_file)

    print("From %s to %s done." % (in_path, out_path))
コード例 #8
0
ファイル: json_fu.py プロジェクト: fmof/concrete-python
def communication_file_to_json(communication_filename):
    """Deserialize the Communication stored in *communication_filename*
    and return its contents as a JSON string.
    """
    return communication_to_json(
        read_communication_from_file(communication_filename))
コード例 #9
0
def test_CommunicationWriter_gz_fixed_point_unicode(output_file):
    """Gzip round-trip: decompressed output must equal the input bytes."""
    source_path = 'tests/testdata/les-deux-chandeliers.concrete'
    comm = read_communication_from_file(source_path)

    with CommunicationWriter(output_file, gzip=True) as writer:
        writer.write(comm)

    with open(source_path, 'rb') as reference_f, \
            gzip.open(output_file, 'rb') as produced_f:
        assert reference_f.read() == produced_f.read()
コード例 #10
0
def test_CommunicationWriter_fixed_point_ctx_mgr(output_file):
    """Context-manager write must reproduce the input file byte-for-byte."""
    source_path = 'tests/testdata/simple_1.concrete'
    comm = read_communication_from_file(source_path)

    with CommunicationWriter(output_file) as writer:
        writer.write(comm)

    with open(source_path, 'rb') as reference_f, \
            open(output_file, 'rb') as produced_f:
        assert reference_f.read() == produced_f.read()
コード例 #11
0
def test_CommunicationWriter_gz_fixed_point_unicode(output_file):
    """Writing with gzip=True then decompressing must yield exactly the
    original serialized Communication bytes.
    """
    input_file = 'tests/testdata/les-deux-chandeliers.concrete'
    comm = read_communication_from_file(input_file)

    with CommunicationWriter(output_file, gzip=True) as writer:
        writer.write(comm)

    with open(input_file, 'rb') as expected_f:
        expected_data = expected_f.read()
    with gzip.open(output_file, 'rb') as actual_f:
        assert expected_data == actual_f.read()
コード例 #12
0
def test_CommunicationWriter_fixed_point_ctx_mgr(output_file):
    """CommunicationWriter used as a context manager is a fixed point:
    the written file's bytes equal the input file's bytes.
    """
    input_file = 'tests/testdata/simple_1.concrete'
    comm = read_communication_from_file(input_file)

    with CommunicationWriter(output_file) as writer:
        writer.write(comm)

    with open(input_file, 'rb') as expected_f:
        expected_data = expected_f.read()
    with open(output_file, 'rb') as actual_f:
        assert expected_data == actual_f.read()
コード例 #13
0
def add_chunks_to_file(in_file, out_file, chunklink, fail_on_error):
    """Read a Communication file, add chunking information, and write a new
    Communication file containing the annotated version.

    Args:
    - `in_file`: path of the Communication file to read
    - `out_file`: path of the annotated Communication file to write
    - `chunklink`: chunklink resource passed through to add_chunks_to_comm
    - `fail_on_error`: whether chunking errors should abort processing
    """
    # Deserialize
    comm = read_communication_from_file(in_file)

    # Add chunks
    num_chunked, num_sents = add_chunks_to_comm(comm, chunklink, fail_on_error)
    # Guard against ZeroDivisionError for a Communication with no sentences,
    # and let logging do the (lazy) %-formatting.
    ratio = float(num_chunked) / float(num_sents) if num_sents else 0.0
    logging.info("Chunked %d / %d = %f", num_chunked, num_sents, ratio)

    # Serialize
    write_communication_to_file(comm, out_file)
コード例 #14
0
def main():
    """Pretty-print a Concrete file as JSON, to stdout or to a file."""
    set_stdout_encoding()

    arg_parser = argparse.ArgumentParser(
        description="Pretty Print a Concrete file")
    arg_parser.add_argument('--concrete_type', default='communication',
                            choices=['communication', 'tokenlattice'],
                            help='Default: communication')
    arg_parser.add_argument('--protocol', default='simple',
                            choices=['simple', 'TJSONProtocol'],
                            help='Default: simple')
    arg_parser.add_argument('--remove-timestamps', action='store_true',
                            help="Removes timestamps from JSON output")
    arg_parser.add_argument('--remove-uuids', action='store_true',
                            help="Removes UUIDs from JSON output")
    arg_parser.add_argument('-l', '--loglevel', '--log-level',
                            help='Logging verbosity level threshold (to stderr)',
                            default='info')
    arg_parser.add_argument('concrete_file',
                            help='path to input concrete communication file')
    arg_parser.add_argument('json_file', nargs='?', default='-',
                            help='path to output json file')
    concrete.version.add_argparse_argument(arg_parser)
    args = arg_parser.parse_args()

    logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s',
                        level=args.loglevel.upper())

    is_communication = args.concrete_type == 'communication'
    if args.protocol == 'simple':
        if is_communication:
            output = communication_file_to_json(
                args.concrete_file,
                remove_timestamps=args.remove_timestamps,
                remove_uuids=args.remove_uuids)
        else:
            output = tokenlattice_file_to_json(args.concrete_file)
    elif is_communication:
        # Thrift's own JSON protocol; serialize() returns bytes.
        comm = read_communication_from_file(args.concrete_file)
        output = TSerialization.serialize(
            comm, TJSONProtocol.TJSONProtocolFactory()).decode('utf-8')
    else:
        raise NotImplementedError

    # '-' means "write to stdout" by convention.
    if args.json_file == '-':
        print(output)
    else:
        with codecs.open(args.json_file, 'w', encoding='utf-8') as f:
            f.write(output)
コード例 #15
0
def add_chunks_to_file(in_file, out_file, chunklink, fail_on_error):
    """Read a Communication file, add chunking information, and write a new
    Communication file containing the annotated version.

    Args:
    - `in_file`: path of the Communication file to read
    - `out_file`: path of the annotated Communication file to write
    - `chunklink`: chunklink resource passed through to add_chunks_to_comm
    - `fail_on_error`: whether chunking errors should abort processing
    """
    # Deserialize
    comm = read_communication_from_file(in_file)

    # Add chunks
    num_chunked, num_sents = add_chunks_to_comm(comm, chunklink, fail_on_error)
    # Avoid ZeroDivisionError for a Communication with no sentences; pass
    # the values lazily instead of pre-formatting the message.
    fraction = float(num_chunked) / float(num_sents) if num_sents else 0.0
    logging.info("Chunked %d / %d = %f", num_chunked, num_sents, fraction)

    # Serialize
    write_communication_to_file(comm, out_file)
コード例 #16
0
def test_CommunicationWriterZip_single_file_ctx_mgr(output_file, login_info):
    """Write one Communication via the CommunicationWriterZip context
    manager and verify the single member's name, timestamp, and size.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterZip(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert zipfile.is_zipfile(output_file)

    # Close the archive even if an assertion fails (original leaked it).
    with zipfile.ZipFile(output_file) as f:
        # Destructuring assert: the archive has exactly one member.
        [zipinfo] = f.infolist()

        assert "simple_1.concrete" == zipinfo.filename
        assert timegm(zipinfo.date_time) > timegm(localtime()) - TIME_MARGIN
        assert os.stat(
            'tests/testdata/simple_1.concrete').st_size == zipinfo.file_size
コード例 #17
0
def test_CommunicationWriter_gz_fixed_point(output_file):
    """Explicit open/write/close with gzip=True must reproduce the input
    bytes exactly after decompression.
    """
    source_path = 'tests/testdata/simple_1.concrete'
    comm = read_communication_from_file(source_path)

    writer = CommunicationWriter(gzip=True)
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    with open(source_path, 'rb') as reference_f, \
            gzip.open(output_file, 'rb') as produced_f:
        assert reference_f.read() == produced_f.read()
コード例 #18
0
def test_CommunicationWriter_gz_fixed_point(output_file):
    """Gzip round-trip through explicit open/write/close is a fixed point."""
    input_file = 'tests/testdata/simple_1.concrete'
    comm = read_communication_from_file(input_file)

    writer = CommunicationWriter(gzip=True)
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    with open(input_file, 'rb') as expected_f:
        expected_data = expected_f.read()
    with gzip.open(output_file, 'rb') as actual_f:
        assert expected_data == actual_f.read()
コード例 #19
0
def test_CommunicationWriterZip_single_file_ctx_mgr(output_file, login_info):
    """Write one Communication via the CommunicationWriterZip context
    manager and verify the single member's name, timestamp, and size.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterZip(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert zipfile.is_zipfile(output_file)

    # Close the archive even if an assertion fails (original leaked it).
    with zipfile.ZipFile(output_file) as f:
        # Destructuring assert: the archive has exactly one member.
        [zipinfo] = f.infolist()

        assert "simple_1.concrete" == zipinfo.filename
        assert timegm(zipinfo.date_time) > timegm(localtime()) - TIME_MARGIN
        assert os.stat(
            'tests/testdata/simple_1.concrete').st_size == zipinfo.file_size
コード例 #20
0
def communication_file_to_json(communication_filename, remove_timestamps=False,
                               remove_uuids=False):
    """Get a "pretty-printed" JSON string representation for a Communication

    Args:

    - `communication_filename`: String specifying Communication filename
    - `remove_timestamps`: Boolean flag indicating if Concrete timestamps
                           should be removed
    - `remove_uuids`: Boolean flag indicating if Concrete UUIDs should be
                      removed

    Returns:

    - A string containing a "pretty-printed" JSON representation of the
      Communication
    """
    comm = read_communication_from_file(communication_filename)
    return thrift_to_json(comm, remove_timestamps=remove_timestamps,
                          remove_uuids=remove_uuids)
コード例 #21
0
def main():
    """Pretty-print a Concrete file as JSON, to stdout or to a file."""
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(
        description="Pretty Print a Concrete file")
    parser.add_argument('--concrete_type', default='communication',
                        choices=['communication', 'tokenlattice'],
                        help='Default: communication')
    parser.add_argument('--protocol', default='simple',
                        choices=['simple', 'TJSONProtocol'],
                        help='Default: simple')
    parser.add_argument('--remove-timestamps', action='store_true',
                        help="Removes timestamps from JSON output")
    parser.add_argument('--remove-uuids', action='store_true',
                        help="Removes UUIDs from JSON output")
    parser.add_argument('concrete_file')
    parser.add_argument('json_file', nargs='?', default='STDOUT')
    concrete.version.add_argparse_argument(parser)
    args = parser.parse_args()

    if args.protocol == 'simple':
        if args.concrete_type == 'communication':
            json_communication = communication_file_to_json(
                args.concrete_file,
                remove_timestamps=args.remove_timestamps,
                remove_uuids=args.remove_uuids
            )
        else:
            json_communication = tokenlattice_file_to_json(args.concrete_file)
    else:
        if args.concrete_type == 'communication':
            comm = read_communication_from_file(args.concrete_file)
            json_communication = TSerialization.serialize(
                comm, TJSONProtocol.TJSONProtocolFactory())
        else:
            raise NotImplementedError

    if args.json_file == 'STDOUT':
        # Parenthesized single-argument print works on Python 2 and 3
        # (the bare `print x` statement was Python-2-only).
        print(json_communication)
    else:
        # Context manager closes the file even if write() raises; the
        # original leaked the handle on error.
        with codecs.open(args.json_file, "w", encoding="utf-8") as f:
            f.write(json_communication)
コード例 #22
0
def test_CommunicationWriterZip_single_file_fixed_point(
        output_file, login_info):
    """Zip round-trip: the single archive member's bytes must equal the
    input file's bytes.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterZip(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert zipfile.is_zipfile(output_file)

    # Context managers close the archive and the member file even if an
    # assertion fails (the original leaked both on failure).
    with zipfile.ZipFile(output_file) as f:
        # Destructuring assert: the archive has exactly one member.
        [zipinfo] = f.infolist()

        assert "simple_1.concrete" == zipinfo.filename
        with f.open(zipinfo) as member_f:
            actual_data = member_f.read()
        with open('tests/testdata/simple_1.concrete', 'rb') as expected_f:
            expected_data = expected_f.read()
            assert expected_data == actual_data
コード例 #23
0
def test_CommunicationWriterZip_single_file_fixed_point(output_file,
                                                        login_info):
    """Zip round-trip: the single archive member's bytes must equal the
    input file's bytes.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    with CommunicationWriterZip(output_file) as writer:
        writer.write(comm, "simple_1.concrete")

    assert zipfile.is_zipfile(output_file)

    # Context managers close the archive and the member file even if an
    # assertion fails (the original leaked both on failure).
    with zipfile.ZipFile(output_file) as f:
        # Destructuring assert: the archive has exactly one member.
        [zipinfo] = f.infolist()

        assert "simple_1.concrete" == zipinfo.filename
        with f.open(zipinfo) as member_f:
            actual_data = member_f.read()
        with open('tests/testdata/simple_1.concrete', 'rb') as expected_f:
            expected_data = expected_f.read()
            assert expected_data == actual_data
コード例 #24
0
def communication_file_to_json(communication_filename,
                               remove_timestamps=False,
                               remove_uuids=False):
    """Get a "pretty-printed" JSON string representation for a Communication

    Args:

    - `communication_filename`: String specifying Communication filename
    - `remove_timestamps`: Boolean flag indicating if Concrete timestamps
                           should be removed
    - `remove_uuids`: Boolean flag indicating if Concrete UUIDs should be
                      removed

    Returns:

    - A string containing a "pretty-printed" JSON representation of the
      Communication
    """
    comm = read_communication_from_file(communication_filename)
    return thrift_to_json(comm,
                          remove_timestamps=remove_timestamps,
                          remove_uuids=remove_uuids)
コード例 #25
0
def test_CommunicationWriterZip_single_file_default_name(output_file,
                                                         login_info):
    """With no explicit name, CommunicationWriterZip must archive the
    member as comm.id + '.concrete'; also check timestamp and size.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterZip()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert zipfile.is_zipfile(output_file)

    # Close the archive even if an assertion fails (original leaked it).
    with zipfile.ZipFile(output_file) as f:
        # Destructuring assert: the archive has exactly one member.
        [zipinfo] = f.infolist()

        assert comm.id + '.concrete' == zipinfo.filename
        assert timegm(zipinfo.date_time) > timegm(localtime()) - TIME_MARGIN
        assert os.stat(
            'tests/testdata/simple_1.concrete').st_size == zipinfo.file_size
コード例 #26
0
def test_CommunicationWriterZip_single_file_default_name(
        output_file, login_info):
    """With no explicit name, CommunicationWriterZip must archive the
    member as comm.id + '.concrete'; also check timestamp and size.
    """
    comm = read_communication_from_file("tests/testdata/simple_1.concrete")
    writer = CommunicationWriterZip()
    try:
        writer.open(output_file)
        writer.write(comm)
    finally:
        writer.close()

    assert zipfile.is_zipfile(output_file)

    # Close the archive even if an assertion fails (original leaked it).
    with zipfile.ZipFile(output_file) as f:
        # Destructuring assert: the archive has exactly one member.
        [zipinfo] = f.infolist()

        assert comm.id + '.concrete' == zipinfo.filename
        assert timegm(zipinfo.date_time) > timegm(localtime()) - TIME_MARGIN
        assert os.stat(
            'tests/testdata/simple_1.concrete').st_size == zipinfo.file_size
コード例 #27
0
def serve():
    """Build the in-memory KB index and start (or return) the NVBS search
    service.

    Reads feature and entity-map files from args.data_dir, reconstructs
    the guid<->model-id alignment, loads (or rebuilds and pickles) the
    guid->sentences cache from Concrete files, and wires everything into
    an EntitySearchProvider wrapped in a SearchServiceWrapper.

    NOTE(review): depends on a module-level `args` namespace (data_dir,
    concrete_entity_dir, language, model_pkl, algorithm, k_query,
    k_rationale, serve, host, port) -- confirm it is populated before
    serve() is called.  Python 2 only (print statements, cPickle).
    """
    train_test_url = opj(args.data_dir, 'train_test.feat')
    entity_map_url = opj(args.data_dir, 'entity.map')
    feat_map_url = opj(args.data_dir, 'vocab.new')
    # NOTE(review): entity_sent_url is only used by the commented-out
    # block below; it is currently dead.
    entity_sent_url = opj(args.data_dir, 'entities.sentences')
    guid2name = {}  # dbid -> canonical entity name
    guid2id = {}    # dbid -> model entity id (via alignment)
    id2guid = {}    # model entity id -> dbid
    guid2sent = {}  # comm id -> [(sentence uuid, [token strings]), ...]
    # The train_test.feat file contains some entities such as number 1997
    # that has no features. Its feature line is blank.
    # These entities were removed while training the neural network architecture.
    # Therefore to map the embeddings in NVGE back to the KB we need to use this
    # alignment information. This information is not necessary for BS because BS
    # can easily handle the fact that some entities have no features (ie. the )
    # document is empty.
    data_set, data_count, alignment = utils.data_set(train_test_url)
    # entity.map rows are "<dbid>\t<canonical name>"; idx is the row number,
    # which `alignment` maps onto the trained model's entity id.
    for idx, row in enumerate(
            codecs.open(entity_map_url, 'r', 'utf-8').read().split('\n')):
        if row == '': continue
        dbid, canonical = row.split('\t')
        guid2name[dbid] = canonical
        if idx in alignment:
            guid2id[dbid] = alignment[idx]
            id2guid[alignment[idx]] = dbid

    # Sentence cache lives one directory above data_dir.
    GUID2SENT_PKL_FILE = opj(args.data_dir, os.path.pardir, 'guid2sent.pkl')
    try:
        print 'Loading', GUID2SENT_PKL_FILE
        guid2sent = pkl.load(open(GUID2SENT_PKL_FILE))
    except:
        # NOTE(review): bare except also swallows unpickling errors and
        # KeyboardInterrupt; the apparent intent is "rebuild the cache
        # when it is missing or unreadable".
        print 'Could not find', GUID2SENT_PKL_FILE
        concrete_entity_files = os.listdir(args.concrete_entity_dir)
        for commidx, filename in enumerate(concrete_entity_files):
            # Progress percentage; trailing comma suppresses the newline.
            print '%-5d\r' % ((commidx * 100) / len(concrete_entity_files)),
            comm = read_communication_from_file(
                opj(args.concrete_entity_dir, filename))
            guid = comm.id
            # assumes every sentence of the Communication lives in
            # sectionList[0] -- TODO confirm for this corpus.
            for sent in comm.sectionList[0].sentenceList:
                uuid = sent.uuid.uuidString
                tokens = [
                    e.text for e in sent.tokenization.tokenList.tokenList
                ]
                try:
                    guid2sent[guid].append((uuid, tokens))
                except KeyError:
                    guid2sent[guid] = [(uuid, tokens)]
        with open(GUID2SENT_PKL_FILE, 'wb') as gpf:
            print 'Dumping', GUID2SENT_PKL_FILE
            pkl.dump(guid2sent, gpf)

    # for row in codecs.open(entity_sent_url, 'r', 'utf-8').read().split('\n'):
    #     row = row.split(' ||| ')
    #     guid = row[0]
    #     for sent in row[1:]:
    #         tokens = sent.split()
    #         try:
    #             guid2sent[guid].append(tokens)
    #         except KeyError:
    #             guid2sent[guid] = [tokens]
    # vocab.new rows: feature name is the first whitespace-delimited field.
    # The first non-empty row is assigned the LAST feature id
    # (count of non-empty rows - 1); every later row idx maps to idx - 1.
    id2feat_data = codecs.open(feat_map_url, 'r', 'utf-8').read().split('\n')
    id2feat = dict((((sum(1 for e in id2feat_data if e != '') -
                      1) if idx == 0 else (idx - 1)), row.split()[0])
                   for idx, row in enumerate(id2feat_data) if row != '')
    # Sanity check against one known KB entity before serving.
    print('Checking feature size =',
          len(data_set[guid2id[":Entity_ENG_EDL_0092354"]]), 'for',
          guid2name[":Entity_ENG_EDL_0092354"], 'max(id2feat.values())',
          max(id2feat.keys()))

    def load(args):
        # Load the pickled neural-network parameters for NVBS.
        import cPickle as pkl
        with open(opj(args.data_dir, args.model_pkl), 'rb') as f:
            nnp = pkl.load(f)
        return nnp

    handler = EntitySearchProvider(
        args.language,
        NVBS(data_set=data_set,
             nnp=load(args),
             method=getattr(NVBSALGO, args.algorithm),
             opts=args,
             id2guid=id2guid,
             guid2id=guid2id,
             guid2name=guid2name,
             guid2sent=guid2sent,
             id2feat=id2feat), args.k_query, args.k_rationale)
    server = SearchServiceWrapper(handler)
    # With --serve, block forever serving requests; otherwise hand the
    # built index back to the caller.
    if args.serve:
        print('Starting NVBS Server')
        server.serve(args.host, args.port)
    else:
        return handler.index
コード例 #28
0
def read_test_comm():
    """Load the shared 'serif_dog-bites-man' test Communication fixture."""
    return read_communication_from_file(
        "tests/testdata/serif_dog-bites-man.concrete")
コード例 #29
0
def from_concrete_file(comm_file: str,
                       task: str = 'argidcls') -> Document:
    """Load a Concrete Communication file and convert it to a Document.

    Extracts tokenized sentences, converts EVENT SituationMentions into
    Event objects (trigger plus arguments as document-global token-span
    indices), and populates doc.argument_mentions according to *task*:
    'argidcls-noisy' adds every EntityMention as a candidate span, while
    'argcls'/'argidcls' add only the gold argument spans.
    """
    def _entity_mention_to_span_indices(em: EntityMention) -> Tuple[int, int]:
        # Map a mention's sentence-local token indices to document-global
        # (start, end) indices; end is the index of the LAST token
        # (inclusive), not one past it.
        sentid: int = tok_to_sentid[em.tokens.tokenizationId.uuidString]
        start: int = doc.local_to_global(sent_id=sentid, local_idx=em.tokens.tokenIndexList[0])
        end: int = doc.local_to_global(sent_id=sentid, local_idx=em.tokens.tokenIndexList[-1])
        return start, end

    def _normalize_token(t: str) -> str:
        # For ACE dataset
        # Undo PTB-style escaping of quotes and brackets.
        if t == '\'\'':
            return '"'
        elif t == '``':
            return '"'
        elif t == '-LRB-':
            return '('
        elif t == '-RRB-':
            return ')'
        elif t == '-LSB-':
            return '['
        elif t == '-RSB-':
            return ']'
        elif t == '-LCB-':
            return '{'
        elif t == '-RCB-':
            return '}'
        else:
            return t

    def _normalize_role(r: str) -> str:
        # Collapse all time-related role variants (e.g. Time-Within,
        # Time-Starting) into a single 'Time' role.
        if 'Time' in r:
            return 'Time'
        else:
            return r

    comm: Communication = read_communication_from_file(comm_file)
    # Tokenization uuid -> sentence index, used to globalize span indices.
    tok_to_sentid: Dict[str, int] = {}
    sentences: List[List[str]] = []
    # extract tokens to form sentences
    for sent_id, tok in enumerate(get_comm_tokenizations(comm)):
        tok_to_sentid[tok.uuid.uuidString] = sent_id
        sentences.append([
            _normalize_token(t.text)
            for t in tok.tokenList.tokenList
        ])
    doc: Document = Document(doc_key=str(comm.id),
                             events=[],
                             sentences=sentences)

    # convert SituationMention into Event objects
    # assumes all event mentions live in situationMentionSetList[0]
    # -- TODO confirm for this corpus.
    for sm in comm.situationMentionSetList[0].mentionList:
        if sm.situationType != 'EVENT':
            continue
        event: Event = Event(document=doc,
                             kind=sm.situationKind,
                             arguments=[])
        for arg in sm.argumentList:
            # An argument points at either an EntityMention or another
            # SituationMention; anything else is malformed input.
            if arg.entityMentionId is not None:
                arg_entity_mention = comm.entityMentionForUUID[arg.entityMentionId.uuidString]
            elif arg.situationMentionId is not None:
                arg_entity_mention = comm.situationMentionForUUID[arg.situationMentionId.uuidString]
            else:
                raise ValueError
            start_idx, end_idx = _entity_mention_to_span_indices(em=arg_entity_mention)
            # The 'TRIGGER' pseudo-role marks the event trigger span.
            if arg.role == 'TRIGGER':
                event.trigger = Trigger(start=start_idx,
                                        end=end_idx,
                                        document=doc)
            else:
                event.arguments.append(
                    Argument(start=start_idx,
                             end=end_idx,
                             role=_normalize_role(arg.role),  # TODO(Yunmo): Ensure that there is only one Time
                             document=doc)
                )
        if event.trigger is None:
            # Fall back when no explicit TRIGGER argument was present.
            if sm.tokens is None:
                # NOTE(review): `doc.to` looks wrong -- no such attribute
                # is visible here; presumably the document's token count
                # was intended. Verify against the Document class.
                start_idx, end_idx = (0, len(doc.to) - 1)
            else:
                start_idx, end_idx = _entity_mention_to_span_indices(sm)
            event.trigger = Trigger(start=start_idx, end=end_idx, document=doc)
        doc.events.append(event)

    # Candidate argument spans, selected by *task*.
    doc.argument_mentions: List[Span] = []
    if task == 'argidcls-noisy':
        # add all possible for `argidcls`
        for em in comm.entityMentionSetList[0].mentionList:
            start_idx, end_idx = _entity_mention_to_span_indices(em=em)
            doc.argument_mentions.append(Span(start=start_idx,
                                              end=end_idx,
                                              document=doc))
    elif task == 'argcls' or task == 'argidcls':
        for event in doc.events:
            for arg in event.arguments:
                doc.argument_mentions.append(Span(start=arg.start,
                                                  end=arg.end,
                                                  document=doc))
    # else:
    #     raise NotImplemented

    return doc