def read_conll_doc(filename,
                   ans=None,
                   rtext=True,
                   rparses=True,
                   rheads=True,
                   rclusters=True,
                   rner=True,
                   rspeakers=True,
                   rfcol=False):
    """Read a CoNLL coreference-format file into a nested dict.

    Args:
        filename: path to the CoNLL file; read as UTF-8.
        ans: optional dict to merge results into; a new dict when None.
        rtext, rparses, rheads, rclusters, rner, rspeakers, rfcol:
            flags selecting which annotation layers are extracted for
            each document part (rheads only applies when rparses is on).

    Returns:
        ans, with ans[document_location][part_id] = info where info maps
        layer names ('text', 'parses', 'heads', 'mentions', 'clusters',
        'ner', 'speakers', 'fcol') to the parsed annotations.
    """
    if ans is None:
        ans = {}
    cur = []     # buffered content lines of the current document part
    keys = None  # (document location, part id) from the last '#begin'
    # 'with' guarantees the handle is closed (the original leaked it).
    with codecs.open(filename, 'r', 'utf-8') as src:
        for line in src:
            # '#begin'/'#end' lines delimit parts; the original's extra
            # 'len(line) > 0 and' guard was redundant and is dropped.
            if line.startswith(('#begin', '#end')):
                if 'begin' in line:
                    desc = line.split()
                    location = desc[2].strip('();')  # "(bc/x);" -> "bc/x"
                    keys = (location, desc[-1])
                if len(cur) > 0:
                    if keys is None:
                        # Python 3 compatible (was 'print >> sys.stderr');
                        # also fixes the "statemen" typo in the message.
                        sys.stderr.write(
                            "Error reading conll file - invalid #begin "
                            "statement\n" + line)
                    else:
                        info = {}
                        if rtext:
                            info['text'] = read_conll_text(cur)
                        if rparses:
                            info['parses'] = read_conll_parses(cur)
                            if rheads:
                                info['heads'] = [
                                    head_finder.collins_find_heads(parse)
                                    for parse in info['parses']
                                ]
                        if rclusters:
                            info['mentions'], info['clusters'] = \
                                read_conll_coref(cur)
                        if rner:
                            info['ner'] = read_conll_ner(cur)
                        if rspeakers:
                            info['speakers'] = read_conll_speakers(cur)
                        if rfcol:
                            info['fcol'] = read_conll_fcol(cur)
                        ans.setdefault(keys[0], {})[keys[1]] = info
                        keys = None
                cur = []
            else:
                cur.append(line)
    return ans
def read_conll_doc(filename, ans=None, rtext=True, rparses=True, rheads=True, rclusters=True, rner=True, rspeakers=True,
                   rfcol=False):
    """Read an entire CoNLL coreference file into a dictionary.

    Key: the document location taken from each '#begin ...' line.
    Value: a dict with one entry per part; each entry may contain
    'text', 'parses', 'heads', 'mentions'/'clusters', 'ner',
    'speakers' and 'fcol', depending on the r* flags.

    Args:
        filename: path to the CoNLL file; read as UTF-8.
        ans: optional mapping to merge into; when None a defaultdict of
            dicts is created so ans[doc][part] needs no presence check.
        rtext, rparses, rheads, rclusters, rner, rspeakers, rfcol:
            layer-selection flags (rheads only applies with rparses).

    Returns:
        ans, with ans[document_location][part_id] = info.
    """
    if ans is None:
        ans = defaultdict(lambda: {})
    cur = []     # buffered content lines of the current document part
    keys = None  # (document location, part id) from the last '#begin'
    # 'with' guarantees the handle is closed (the original leaked it).
    with codecs.open(filename, 'r', 'utf-8') as src:
        for line in src:
            if line.startswith(('#begin', '#end')):
                if 'begin' in line:
                    desc = line.split()
                    location = desc[2].strip('();')  # "(bc/x);" -> "bc/x"
                    keys = (location, desc[-1])
                if cur:
                    if keys is None:
                        # Python 3 compatible (was 'print >> sys.stderr');
                        # also fixes the "statemen" typo in the message.
                        sys.stderr.write(
                            "Error reading conll file - invalid #begin "
                            "statement\n" + line)
                    else:
                        info = {}
                        if rtext:
                            info['text'] = read_conll_text(cur)
                        if rparses:
                            info['parses'] = read_conll_parses(cur)
                            if rheads:
                                info['heads'] = [
                                    head_finder.collins_find_heads(parse)
                                    for parse in info['parses']
                                ]
                        if rclusters:
                            info['mentions'], info['clusters'] = \
                                read_conll_coref(cur)
                        if rner:
                            info['ner'] = read_conll_ner(cur)
                        if rspeakers:
                            info['speakers'] = read_conll_speakers(cur)
                        if rfcol:
                            info['fcol'] = read_conll_fcol(cur)
                        ans[keys[0]][keys[1]] = info
                        keys = None
                cur = []
            else:
                cur.append(line)
    return ans
# --- Example #3 (score: 0) — separator between scraped snippets ---
def get_tree_data(sentence_list, parse_list, ner_list, character_to_index,
                  word_to_index, pos_to_index, index_to_lexicon):
    """Get tree structured data from CoNLL-2003.

    Stores each sentence into the Node data structure: builds one
    (root_node, additional_node_list) pair per parse and accumulates
    corpus-level counters along the way.

    Args:
        sentence_list: tokenized sentences; sentence_list[i] pairs with
            parse_list[i] and ner_list[i].
        parse_list: one parse tree per sentence (consumed by
            head_finder.collins_find_heads and construct_node).
        ner_list: per-sentence NER annotations (schema defined by
            construct_node / create_dense_nodes).
        character_to_index, word_to_index, pos_to_index, index_to_lexicon:
            vocabulary/feature mappings forwarded to the node builders.

    Returns:
        (tree_pyramid_list, word_count, pos_count, ne_count,
        pos_ne_count, lexicon_hits) where the three counts are
        defaultdicts filled in by the helpers and lexicon_hits is the
        accumulated int.
    """
    # NOTE(review): the original put this docstring AFTER the log() call,
    # making it a dead string literal instead of the docstring; moved up.
    log("get_tree_data()...")
    tree_pyramid_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    # Single-element list so the helpers can mutate the count in place;
    # the final value is returned as lexicon_hits[0].
    lexicon_hits = [0]

    for index, parse in enumerate(parse_list):
        text_raw_data = sentence_list[index]
        word_count += len(text_raw_data)
        span_to_node = {}
        head_raw_data = head_finder.collins_find_heads(parse)

        root_node = Node()
        nodes = construct_node(root_node, parse, ner_list[index],
                               head_raw_data, text_raw_data,
                               character_to_index, word_to_index, pos_to_index,
                               index_to_lexicon, pos_count, ne_count,
                               pos_ne_count, lexicon_hits, span_to_node)
        root_node.nodes = nodes
        root_node.tokens = len(text_raw_data)

        # Presumably adds nodes for NER spans absent from the parse tree;
        # exact semantics live in create_dense_nodes.
        additional_node_list = create_dense_nodes(ner_list[index],
                                                  text_raw_data, pos_to_index,
                                                  index_to_lexicon, pos_count,
                                                  ne_count, pos_ne_count,
                                                  lexicon_hits, span_to_node)

        tree_pyramid_list.append((root_node, additional_node_list))

    log(" %d sentences\n" % len(tree_pyramid_list))
    return (tree_pyramid_list, word_count, pos_count, ne_count, pos_ne_count,
            lexicon_hits[0])
def read_conll_doc(filename, ans=None, rtext=True, rparses=True, rheads=True, rclusters=True, rner=True, rspeakers=True, rfcol=False):
    """Read a CoNLL coreference-format file into a nested dict.

    Args:
        filename: path to the CoNLL file; read as UTF-8.
        ans: optional dict to merge results into; a new dict when None.
        rtext, rparses, rheads, rclusters, rner, rspeakers, rfcol:
            flags selecting which annotation layers are extracted
            (rheads only applies when rparses is on).

    Returns:
        ans, with ans[document_location][part_id] = info, where info
        maps layer names to the parsed annotations for that part.
    """
    if ans is None:
        ans = {}
    part_lines = []  # buffered content lines of the current part
    keys = None      # (document location, part id) from the last '#begin'
    # 'with' guarantees the handle is closed (the original leaked it).
    with codecs.open(filename, 'r', 'utf-8') as src:
        for line in src:
            if not line.startswith(('#begin', '#end')):
                part_lines.append(line)
                continue
            if 'begin' in line:
                desc = line.split()
                keys = (desc[2].strip('();'), desc[-1])
            if part_lines:
                if keys is None:
                    # Python 3 compatible (was 'print >> sys.stderr');
                    # also fixes the "statemen" typo in the message.
                    sys.stderr.write(
                        "Error reading conll file - invalid #begin "
                        "statement\n" + line)
                else:
                    info = {}
                    if rtext:
                        info['text'] = read_conll_text(part_lines)
                    if rparses:
                        info['parses'] = read_conll_parses(part_lines)
                        if rheads:
                            info['heads'] = [
                                head_finder.collins_find_heads(parse)
                                for parse in info['parses']
                            ]
                    if rclusters:
                        info['mentions'], info['clusters'] = \
                            read_conll_coref(part_lines)
                    if rner:
                        info['ner'] = read_conll_ner(part_lines)
                    if rspeakers:
                        info['speakers'] = read_conll_speakers(part_lines)
                    if rfcol:
                        info['fcol'] = read_conll_fcol(part_lines)
                    ans.setdefault(keys[0], {})[keys[1]] = info
                    keys = None
            part_lines = []
    return ans
# --- Example #5 (score: 0) — separator between scraped snippets ---
def get_tree_data(sentence_list, parse_list, ner_list,
        character_to_index, word_to_index, pos_to_index, index_to_lexicon):
    """Get tree structured data from CoNLL-2003.

    Stores each sentence into the Node data structure and returns one
    (root_node, additional_node_list) pair per parse, plus corpus-level
    counters accumulated by the node-building helpers.

    Args:
        sentence_list: tokenized sentences, aligned with parse_list and
            ner_list by index.
        parse_list: one parse tree per sentence.
        ner_list: per-sentence NER annotations (schema defined by
            construct_node / create_dense_nodes).
        character_to_index, word_to_index, pos_to_index, index_to_lexicon:
            vocabulary/feature mappings forwarded to the node builders.

    Returns:
        (tree_pyramid_list, word_count, pos_count, ne_count,
        pos_ne_count, lexicon_hits).
    """
    # NOTE(review): the original put this docstring AFTER the log() call,
    # making it a dead string literal instead of the docstring; moved up.
    log("get_tree_data()...")
    tree_pyramid_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    # Single-element list so the helpers can mutate the count in place;
    # the final value is returned as lexicon_hits[0].
    lexicon_hits = [0]

    for index, parse in enumerate(parse_list):
        text_raw_data = sentence_list[index]
        word_count += len(text_raw_data)
        span_to_node = {}
        head_raw_data = head_finder.collins_find_heads(parse)

        root_node = Node()
        nodes = construct_node(
           root_node, parse, ner_list[index], head_raw_data, text_raw_data,
           character_to_index, word_to_index, pos_to_index, index_to_lexicon,
           pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
        root_node.nodes = nodes
        root_node.tokens = len(text_raw_data)

        # Presumably adds nodes for NER spans absent from the parse tree;
        # exact semantics live in create_dense_nodes.
        additional_node_list = create_dense_nodes(
            ner_list[index], text_raw_data,
            pos_to_index, index_to_lexicon,
            pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)

        tree_pyramid_list.append((root_node, additional_node_list))

    log(" %d sentences\n" % len(tree_pyramid_list))
    return tree_pyramid_list, word_count, pos_count, ne_count, pos_ne_count, lexicon_hits[0]