def tag(tagged_string):
    """ Adds relation tags to a PNP-tagged sentence and returns the re-joined sentence.
        Tokens in the input string must contain WORD, POS, CHUNK and PNP tags.
        Example relation tags: NP-SBJ-1, VP-1, NP-OBJ-1, NP-SBJ-2, ADJP-CLR, ... (see tags.py)
        Tokens whose chunk received no relation get "VP-<n>" when VP_chunks has 
        a non-zero entry for them, and "O" otherwise.
        Note 1: on rare occasions words can be tagged with multiple relations (e.g. NP-OBJ-1*NP-OBJ-3).
        Note 2: this function joins multiple relations with "*"
                (the original notes state the separator can be "*" OR ";").
    """
    # NOTE(review): `tag` is defined a second time further down in this file;
    # the later definition rebinds the name once the module has been loaded.
    tokens = _split(tagged_string)
    verbs, commas, chunks, heads, indices, instance_candidates, VP_chunks = _step1(tokens)
    distance, comma_map, verb_map = _step2(verbs, commas, chunks, len(tokens))

    # Build the classification jobs. The three lists run in parallel:
    # jobs[n] was generated for candidate job_candidates[n] and verb job_verbs[n].
    jobs = []
    job_verbs = []
    job_candidates = []
    for candidate in instance_candidates:
        instances, verb_indices = _instances(
            candidate, distance, verbs, verb_map, comma_map, heads, indices[candidate], tokens)
        prepared = [instance.replace('/', '*') for instance in instances]
        jobs.extend(prepared)
        job_verbs.extend(verb_indices)
        job_candidates.extend([candidate] * len(prepared))

    # The client.batch() function in the client module takes care of managing server clients,
    # we simply pass it all the tagging jobs and a definition of the client we need.
    predictions = client.batch(
        jobs,
        client=(client.Timbl, HOST, PORT, config.RELATION, config.log),
        retries=1)

    # Gather the relations per chunk; a predicted "-" means no relation.
    chunk_relations = {}
    for n, label in enumerate(predictions):
        chunk = chunks[job_candidates[n]]
        vp = str(VP_chunks[job_verbs[n]])
        if label == '-':
            continue
        known = chunk in chunk_relations
        relation = '-'.join([label, vp])
        if not known:
            chunk_relations[chunk] = relation
            continue
        # Never link a chunk to the same verb twice: the first relation wins.
        linked_vps = [r.rsplit('-', 1)[-1] for r in chunk_relations[chunk].split('*')]
        if relation.rsplit('-', 1)[-1] not in linked_vps:
            chunk_relations[chunk] += '*' + relation

    # Collect NP-SBJ, NP-OBJ etc. relations and add the VP relations.
    relations = [
        chunk_relations[chunk] if chunk in chunk_relations
        else ('VP-%d' % (VP_chunks[n]) if VP_chunks[n] else 'O')
        for n, chunk in enumerate(chunks)
    ]

    # Place the relation tags in the sentence (token field 4).
    for n, token in enumerate(tokens):
        token[4] = relations[n]
    return _join(tokens)
def tag(tagged_string):
    """ Adds relation tags to a PNP-tagged sentence and returns the re-joined sentence.
        Tokens in the input string must contain WORD, POS, CHUNK and PNP tags.
        Example relation tags: NP-SBJ-1, VP-1, NP-OBJ-1, NP-SBJ-2, ADJP-CLR, ... (see tags.py)
        Tokens whose chunk received no relation get "VP-<n>" when VP_chunks has 
        a non-zero entry for them, and "O" otherwise.
        Note 1: on rare occasions words can be tagged with multiple relations (e.g. NP-OBJ-1*NP-OBJ-3).
        Note 2: this function joins multiple relations with "*"
                (the original notes state the separator can be "*" OR ";").
    """
    # NOTE(review): an earlier definition of `tag` exists above in this file;
    # this later definition rebinds the name.
    tokens = _split(tagged_string)
    verbs, commas, chunks, heads, indices, instance_candidates, VP_chunks = _step1(tokens)
    distance, comma_map, verb_map = _step2(verbs, commas, chunks, len(tokens))

    # Build the classification jobs. The three lists run in parallel:
    # jobs[n] was generated for candidate job_candidates[n] and verb job_verbs[n].
    jobs = []
    job_verbs = []
    job_candidates = []
    for candidate in instance_candidates:
        instances, verb_indices = _instances(
            candidate, distance, verbs, verb_map, comma_map, heads, indices[candidate], tokens)
        prepared = [instance.replace('/', '*') for instance in instances]
        jobs.extend(prepared)
        job_verbs.extend(verb_indices)
        job_candidates.extend([candidate] * len(prepared))

    # The client.batch() function in the client module takes care of managing server clients,
    # we simply pass it all the tagging jobs and a definition of the client we need.
    predictions = client.batch(
        jobs,
        client=(client.Timbl, HOST, PORT, config.RELATION, config.log),
        retries=1)

    # Gather the relations per chunk; a predicted "-" means no relation.
    chunk_relations = {}
    for n, label in enumerate(predictions):
        chunk = chunks[job_candidates[n]]
        vp = str(VP_chunks[job_verbs[n]])
        if label == '-':
            continue
        known = chunk in chunk_relations
        relation = '-'.join([label, vp])
        if not known:
            chunk_relations[chunk] = relation
            continue
        # Never link a chunk to the same verb twice: the first relation wins.
        linked_vps = [r.rsplit('-', 1)[-1] for r in chunk_relations[chunk].split('*')]
        if relation.rsplit('-', 1)[-1] not in linked_vps:
            chunk_relations[chunk] += '*' + relation

    # Collect NP-SBJ, NP-OBJ etc. relations and add the VP relations.
    relations = [
        chunk_relations[chunk] if chunk in chunk_relations
        else ('VP-%d' % (VP_chunks[n]) if VP_chunks[n] else 'O')
        for n, chunk in enumerate(chunks)
    ]

    # Place the relation tags in the sentence (token field 4).
    for n, token in enumerate(tokens):
        token[4] = relations[n]
    return _join(tokens)
def _chunk(string):
    """ Takes a tokenized and escaped string where sentences are separated by a new line.
        Returns a string where words have been tagged with their part-of-speech and chunk tags.
        Empty lines in the input are skipped; the tagged sentences are joined with "\\n".
        Common part-of-speech tags include NN (noun), VB (verb), JJ (adjective), PP (preposition).
        Common chunk tags include NP (noun phrase), VP (verb phrase), ...
        - input:  'Draw a red car .'
        - output: Draw/VB/I-VP a/DT/I-NP red/JJ/I-NP car/NN/I-NP ././O
    """
    # NOTE(review): `_chunk` is defined a second time further down in this file;
    # the later definition rebinds the name once the module has been loaded.
    # Build a real list rather than calling filter(): on Python 3 filter() returns 
    # a lazy, single-pass iterator, whereas a list is the same on Python 2 and 3
    # and can be measured, indexed and re-read by the batch client.
    sentences = [line for line in string.splitlines() if line]
    host = HOSTS[CHUNK]
    port = PORTS[CHUNK]
    # Send the sentences to the TiMBL server.
    # The batch() function in the client module takes care of managing server clients,
    # we simply pass it all the tagging jobs and a definition of the client we need.
    tagged = client.batch(
        sentences,
        client=(client.Mbt, host, port, CHUNK, config.log),
        retries=1)
    return '\n'.join(tagged)
def _chunk(string):
    """ Takes a tokenized and escaped string where sentences are separated by a new line.
        Returns a string where words have been tagged with their part-of-speech and chunk tags.
        Empty lines in the input are skipped; the tagged sentences are joined with "\\n".
        Common part-of-speech tags include NN (noun), VB (verb), JJ (adjective), PP (preposition).
        Common chunk tags include NP (noun phrase), VP (verb phrase), ...
        - input:  'Draw a red car .'
        - output: Draw/VB/I-VP a/DT/I-NP red/JJ/I-NP car/NN/I-NP ././O
    """
    # NOTE(review): an earlier definition of `_chunk` exists above in this file;
    # this later definition rebinds the name.
    # Build a real list rather than calling filter(): on Python 3 filter() returns 
    # a lazy, single-pass iterator, whereas a list is the same on Python 2 and 3
    # and can be measured, indexed and re-read by the batch client.
    sentences = [line for line in string.splitlines() if line]
    host = HOSTS[CHUNK]
    port = PORTS[CHUNK]
    # Send the sentences to the TiMBL server.
    # The batch() function in the client module takes care of managing server clients,
    # we simply pass it all the tagging jobs and a definition of the client we need.
    tagged = client.batch(
        sentences,
        client=(client.Mbt, host, port, CHUNK, config.log),
        retries=1)
    return '\n'.join(tagged)