Esempio n. 1
0
    def parse_test(self):
        """Run process_wsj_file over a sample WSJ file and check the label tallies."""
        wsj_path = '/Users/rgeorgi/Documents/treebanks/LDC95T07/RAW/combined/wsj/00/wsj_0001.mrg'
        label_counts = CountDict()

        def tally_labels(token_seq):
            # Callback handed to process_wsj_file: record the label of
            # every token it is given.
            for tok in token_seq:
                label_counts.add(tok.label)

        process_wsj_file(wsj_path, tally_labels)

        # There should be 31 total tokens in this file...
        self.assertEqual(31, label_counts.total())

        # ...and two of them carry the "." label.
        self.assertEqual(label_counts['.'], 2)
Esempio n. 2
0
def process_file(path, tm, delimeter='/'):
    """
    Read one tagged file and collect its (lowercased word, label) pairs.

    The reader is chosen by file extension: a ".mrg" file is handed to
    process_wsj_file, anything else to process_slashtag_file.

    :param path: Path of the file to read.
    :param tm: Tag map supporting ``in`` and ``[]`` lookups keyed by token
               label, or None to keep every label unchanged. Labels missing
               from the map are kept as-is and a warning is logged.
    :param delimeter: Forwarded to process_slashtag_file; not used for
                      ".mrg" files. (The spelling is part of the public
                      signature and is kept for keyword callers.)
    :return: Tuple ``(c, cur_token_count, cur_linecount)`` where ``c`` is the
             populated POSEvalDict and the other two are the values returned
             by whichever reader was used.
    """
    c = POSEvalDict()

    def add_to_dict(tokens):
        for token in tokens:
            label = token.label

            # Do the tagset remapping.
            if tm is not None:
                if label in tm:
                    label = tm[label]
                else:
                    # Logger.warn is a deprecated alias; warning() is the
                    # supported spelling.
                    DICT_LOG.warning('Tagmap defined, but "{}" not found.'.format(label))

            c.add(token.seq.lower(), label)

    print('Reading file "{}"'.format(os.path.basename(path)))
    ext = os.path.splitext(path)[1]

    # If the specified file extension is ".mrg", treat it as a WSJ file.
    if ext == '.mrg':
        cur_token_count, cur_linecount = process_wsj_file(path, add_to_dict)

    # Otherwise, assume it is a slashtag file.
    else:
        cur_token_count, cur_linecount = process_slashtag_file(path, add_to_dict, delimeter=delimeter)

    return c, cur_token_count, cur_linecount