Beispiel #1
0
def tag(tagged_string):
    """ Adds relation tags to a PNP-tagged sentence and returns the re-joined sentence.
        Every token in the input string must carry WORD, POS, CHUNK and PNP tags.
        Relation tags look like: NP-SBJ-1, VP-1, NP-OBJ-1, NP-SBJ-2, ADJP-CLR, ... (see tags.py)
        Note 1: occasionally a chunk is linked to several verbs, in which case its relations
                are concatenated (e.g. NP-OBJ-1*NP-OBJ-3).
        Note 2: the separator for multiple relations can be "*" OR ";"
                (this function itself always joins them with "*").
    """
    tokens = _split(tagged_string)
    (verbs, commas, chunks, heads, indices,
     instance_candidates, VP_chunks) = _step1(tokens)
    distance, comma_map, verb_map = _step2(verbs, commas, chunks, len(tokens))
    # Three parallel lists, one entry per classification job:
    jobs = []            # Instance strings sent to the server.
    job_verbs = []       # Verb index of each job, as returned by _instances().
    job_candidates = []  # Instance candidate each job was generated for.
    for candidate in instance_candidates:
        instances, verb_indices = _instances(
            candidate, distance, verbs, verb_map, comma_map, heads,
            indices[candidate], tokens)
        escaped = [instance.replace('/', '*') for instance in instances]
        jobs.extend(escaped)
        job_verbs.extend(verb_indices)
        job_candidates.extend([candidate] * len(escaped))
    # client.batch() manages the server clients for us:
    # it only needs the list of jobs and a description of the client to create.
    client_definition = (client.Timbl, HOST, PORT, config.RELATION, config.log)
    labels = client.batch(jobs, client=client_definition, retries=1)
    # Merge the per-job labels into one relation string per chunk.
    relation_by_chunk = {}
    for job, label in enumerate(labels):
        chunk = chunks[job_candidates[job]]
        vp_id = str(VP_chunks[job_verbs[job]])
        if label == '-':
            # '-' labels are skipped: no relation is recorded for this job.
            continue
        existing = relation_by_chunk.get(chunk)
        relation = '-'.join([label, vp_id])
        if existing is None:
            relation_by_chunk[chunk] = relation
            continue
        # A chunk is linked to the same verb only once; the first occurrence wins.
        linked_vps = [part.split('-')[-1] for part in existing.split('*')]
        if relation.split('-')[-1] not in linked_vps:
            relation_by_chunk[chunk] = existing + '*' + relation
    # One relation per chunk position: the merged NP-SBJ/NP-OBJ/... relation if there is one,
    # otherwise a plain VP relation for verb chunks, otherwise 'O'.
    relations = []
    for position, chunk in enumerate(chunks):
        if chunk in relation_by_chunk:
            relation = relation_by_chunk[chunk]
        elif VP_chunks[position]:
            relation = 'VP-%d' % (VP_chunks[position])
        else:
            relation = 'O'
        relations.append(relation)
    # Store the relation in slot 4 of every token.
    for position, token in enumerate(tokens):
        token[4] = relations[position]
    return _join(tokens)
Beispiel #2
0
def tag(tagged_string):
    """ Adds relation tags to a PNP-tagged sentence and returns the re-joined sentence.
        Every token in the input string must carry WORD, POS, CHUNK and PNP tags.
        Relation tags look like: NP-SBJ-1, VP-1, NP-OBJ-1, NP-SBJ-2, ADJP-CLR, ... (see tags.py)
        Note 1: occasionally a chunk is linked to several verbs, in which case its relations
                are concatenated (e.g. NP-OBJ-1*NP-OBJ-3).
        Note 2: the separator for multiple relations can be "*" OR ";"
                (this function itself always joins them with "*").
    """
    tokens = _split(tagged_string)
    (verbs, commas, chunks, heads, indices,
     instance_candidates, VP_chunks) = _step1(tokens)
    distance, comma_map, verb_map = _step2(verbs, commas, chunks, len(tokens))
    # Three parallel lists, one entry per classification job:
    jobs = []            # Instance strings sent to the server.
    job_verbs = []       # Verb index of each job, as returned by _instances().
    job_candidates = []  # Instance candidate each job was generated for.
    for candidate in instance_candidates:
        instances, verb_indices = _instances(
            candidate, distance, verbs, verb_map, comma_map, heads,
            indices[candidate], tokens)
        escaped = [instance.replace('/', '*') for instance in instances]
        jobs.extend(escaped)
        job_verbs.extend(verb_indices)
        job_candidates.extend([candidate] * len(escaped))
    # client.batch() manages the server clients for us:
    # it only needs the list of jobs and a description of the client to create.
    client_definition = (client.Timbl, HOST, PORT, config.RELATION, config.log)
    labels = client.batch(jobs, client=client_definition, retries=1)
    # Merge the per-job labels into one relation string per chunk.
    relation_by_chunk = {}
    for job, label in enumerate(labels):
        chunk = chunks[job_candidates[job]]
        vp_id = str(VP_chunks[job_verbs[job]])
        if label == '-':
            # '-' labels are skipped: no relation is recorded for this job.
            continue
        existing = relation_by_chunk.get(chunk)
        relation = '-'.join([label, vp_id])
        if existing is None:
            relation_by_chunk[chunk] = relation
            continue
        # A chunk is linked to the same verb only once; the first occurrence wins.
        linked_vps = [part.split('-')[-1] for part in existing.split('*')]
        if relation.split('-')[-1] not in linked_vps:
            relation_by_chunk[chunk] = existing + '*' + relation
    # One relation per chunk position: the merged NP-SBJ/NP-OBJ/... relation if there is one,
    # otherwise a plain VP relation for verb chunks, otherwise 'O'.
    relations = []
    for position, chunk in enumerate(chunks):
        if chunk in relation_by_chunk:
            relation = relation_by_chunk[chunk]
        elif VP_chunks[position]:
            relation = 'VP-%d' % (VP_chunks[position])
        else:
            relation = 'O'
        relations.append(relation)
    # Store the relation in slot 4 of every token.
    for position, token in enumerate(tokens):
        token[4] = relations[position]
    return _join(tokens)
Beispiel #3
0
def _chunk(string):
    """ Takes a tokenized and escaped string where sentences are separated by a new line.
        Returns a string where words have been tagged with their part-of-speech and chunk tags,
        with the tagged sentences joined by new lines. Empty lines in the input are dropped.
        Common part-of-speech tags include NN (noun), VB (verb), JJ (adjective), PP (preposition).
        Common chunk tags include NP (noun phrase), VP (verb phrase), ...
        - input: 'Draw a red car .'
        - output: 'Draw/VB/I-VP a/DT/I-NP red/JJ/I-NP car/NN/I-NP ././O'
    """
    # Build a real list rather than calling filter(): on Python 3 filter() returns
    # a one-shot lazy iterator, which has no len(), cannot be indexed and is empty
    # on a second pass. On Python 2 the result is the same list as before.
    # NOTE(review): client.batch() is not visible here - presumably it needs a
    # sequence (e.g. to retry jobs); confirm against the client module.
    sentences = [line for line in string.splitlines() if line]
    host = HOSTS[CHUNK]
    port = PORTS[CHUNK]
    # Send the sentences to the TiMBL server.
    # The batch() function in the client module takes care of managing server clients,
    # we simply pass it all the tagging jobs and a definition of the client we need.
    return '\n'.join(client.batch(sentences, client=(client.Mbt, host, port, CHUNK, config.log), retries=1))
Beispiel #4
0
def _chunk(string):
    """ Takes a tokenized and escaped string where sentences are separated by a new line.
        Returns a string where words have been tagged with their part-of-speech and chunk tags,
        with the tagged sentences joined by new lines. Empty lines in the input are dropped.
        Common part-of-speech tags include NN (noun), VB (verb), JJ (adjective), PP (preposition).
        Common chunk tags include NP (noun phrase), VP (verb phrase), ...
        - input: 'Draw a red car .'
        - output: 'Draw/VB/I-VP a/DT/I-NP red/JJ/I-NP car/NN/I-NP ././O'
    """
    # Build a real list rather than calling filter(): on Python 3 filter() returns
    # a one-shot lazy iterator, which has no len(), cannot be indexed and is empty
    # on a second pass. On Python 2 the result is the same list as before.
    # NOTE(review): client.batch() is not visible here - presumably it needs a
    # sequence (e.g. to retry jobs); confirm against the client module.
    sentences = [line for line in string.splitlines() if line]
    host = HOSTS[CHUNK]
    port = PORTS[CHUNK]
    # Send the sentences to the TiMBL server.
    # The batch() function in the client module takes care of managing server clients,
    # we simply pass it all the tagging jobs and a definition of the client we need.
    return '\n'.join(
        client.batch(sentences,
                     client=(client.Mbt, host, port, CHUNK, config.log),
                     retries=1))