Example #1
    def test_senna_chunk_tagger(self):
        chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH)
        result_1 = chktagger.tag('What is the airspeed of an unladen swallow ?'.split())
        expected_1 = [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed',
            'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow',
            'I-NP'), ('?', 'O')]

        result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type='NP'))
        expected_2 = [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow',
            '5-6-7')]
        self.assertEqual(result_1, expected_1)
        self.assertEqual(result_2, expected_2)
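
All of the examples below follow the same pattern: build a SennaChunkTagger that points at a local SENNA installation, call tag() on a pre-tokenized sentence to get (token, BIO chunk tag) pairs, and optionally group the tags into phrases with bio_to_chunks(). A minimal standalone sketch, assuming SENNA lives at the hypothetical path /usr/share/senna-v3.0:

from nltk.tag import SennaChunkTagger

# Hypothetical path; point this at your own SENNA installation.
tagger = SennaChunkTagger('/usr/share/senna-v3.0')
tokens = 'What is the airspeed of an unladen swallow ?'.split()
tagged = tagger.tag(tokens)            # (token, BIO chunk tag) pairs
print(tagged)
print(list(tagger.bio_to_chunks(tagged, chunk_type='NP')))  # NP phrases with token offsets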
Example #2
    def test_senna_chunk_tagger(self):
        chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH)
        result_1 = chktagger.tag(
            'What is the airspeed of an unladen swallow ?'.split())
        expected_1 = [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'),
                      ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'),
                      ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')]

        result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type='NP'))
        expected_2 = [('What', '0'), ('the airspeed', '2-3'),
                      ('an unladen swallow', '5-6-7')]
        self.assertEqual(result_1, expected_1)
        self.assertEqual(result_2, expected_2)
Example #3
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = [
            'pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color'
        ]

        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)

        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

        self.sentences = []

        self.porter = PorterStemmer()

        self.token_dict = None
        self.bins = 50
Example #4
    def test_senna_chunk_tagger(self):
        chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH)
        result_1 = chktagger.tag("What is the airspeed of an unladen swallow ?".split())
        expected_1 = [
            ("What", "B-NP"),
            ("is", "B-VP"),
            ("the", "B-NP"),
            ("airspeed", "I-NP"),
            ("of", "B-PP"),
            ("an", "B-NP"),
            ("unladen", "I-NP"),
            ("swallow", "I-NP"),
            ("?", "O"),
        ]

        result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type="NP"))
        expected_2 = [
            ("What", "0"),
            ("the airspeed", "2-3"),
            ("an unladen swallow", "5-6-7"),
        ]
        self.assertEqual(result_1, expected_1)
        self.assertEqual(result_2, expected_2)
Example #5
 def __init__(self):
     '''
     if phrase_dict_json != None: extract the phrase features
     if subtype_flag = True, extract the features by sub parse_type
     if bioe_flag = True, use the BIOE tags
     '''
     self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color']
     
     if 'pos' in self.features:
         self.pos_tagger = SennaTagger(global_params.sennadir)
     
     if 'chunk' in self.features:
         self.chunk_tagger = SennaChunkTagger(global_params.sennadir)
     
     self.sentences = []
     
     self.porter = PorterStemmer()
     
     self.token_dict = None
     self.bins = 50
Example #6
import os, nltk
import re
from nltk.tag import SennaTagger, SennaChunkTagger
from nltk.tokenize import sent_tokenize

#Constants
SOURCE_DIR = '../data/annotated/'
SENNA_INPUT_DIR_RESPS = '../data/senna_input_resps/'
SENNA_INPUT_DIR_SENTS = '../data/senna_input_sents/'
SENNA_DEST_DIR = '../data/senna_wordlist/'
SENNA_EXECUTABLE_DIR = '../../tools/senna'
"""
For now these taggers are not used; SENNA tagging is done manually using a shell script.
"""
pos_tagger = SennaTagger(SENNA_EXECUTABLE_DIR)
chunk_tagger = SennaChunkTagger(SENNA_EXECUTABLE_DIR)


def add_space_between_sentences(text):
    """
    Add space between sentences where no space is added after period
    """
    space_added_txt = re.sub(r"(\w+)\.(\w+)", r"\1. \2", text)
    return space_added_txt
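# Illustrative usage (not part of the original script):
#   add_space_between_sentences("First sentence.Second one") -> "First sentence. Second one"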


def add_space_between_sentence_and_period(text, text_type):
    """
    Add space between sentence and period.
    This is needed for SENNA to tokenize sentences.
    text_type:  "single" for single sentence, 
Example #7
import pickle
import nltk
from nltk.tag import SennaChunkTagger
from nltk.tag import StanfordNERTagger

filep = open('out', 'rb')
mydict = pickle.load(filep)
filep.close()

lines = []
filet = open('input', 'r')
for line in filet:
    lines.append(nltk.word_tokenize(line))

chunks = []
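# Chunk-tag each tokenized line with SENNA; tag() returns (token, BIO chunk tag) pairs.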
tagger = SennaChunkTagger('/home/senna')
for line in lines:
    chunks.append(tagger.tag(line))

flatchunks = [val for sublist in chunks for val in sublist]

entities = []
st = StanfordNERTagger(
    '/home/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
    '/home/stanford-ner/stanford-ner.jar',
    encoding='utf-8')
for line in lines:
    entities.append(st.tag(line))

flatentities = [val for sublist in entities for val in sublist]
Example #8
                if 'A1' == arg:
                    EventStructures['Who'] = text
                elif 'A2' == arg:
                    EventStructures['Whom'] = text
                text = labels[i][1][0]
                Args.append(text)
        else:
            text += ' ' + labels[i][1][0]

    print(EventStructures)
    return Args


srltagger = SennaSRLTagger(path)
nertagger = SennaNERTagger(path)
chktagger = SennaChunkTagger(path)
tagger = SennaTagger(path)

#w = s.tag("Are you studying here?".split())
#w = s.tag("""A general interface to the SENNA pipeline that supports any of the operations specified in SUPPORTED OPERATIONS..""".split())

#print(tagger.tag(sents))
#print('\n___________________\n')
#print(chktagger.tag(sents))
#print('\n___________________\n')
#print(nertagger.tag(sents))
#print('\n___________________\n')
#print(srltagger.tag(sents))
#print('\n___________________\n')
#text = sent
NE_Tagger(text)
Example #9
    def __init__(self, embedder, tag_vocab, ner_vocab, pos_vocab, sess=None):

        # check gpu
        if not check_gpu_existence():
            raise RuntimeError('Ontonotes NER model requires GPU with cuDNN!')

        n_hidden = (256, 256, 256)
        token_embeddings_dim = 100
        n_tags = len(tag_vocab)

        # Create placeholders
        x_word = tf.placeholder(dtype=tf.float32,
                                shape=[None, None, token_embeddings_dim],
                                name='x_word')
        x_char = tf.placeholder(dtype=tf.int32,
                                shape=[None, None, None],
                                name='x_char')

        # Features
        x_pos = tf.placeholder(dtype=tf.float32,
                               shape=[None, None, len(pos_vocab)],
                               name='x_pos')  # Senna
        x_ner = tf.placeholder(dtype=tf.float32,
                               shape=[None, None, len(ner_vocab)],
                               name='x_ner')  # Senna
        x_capi = tf.placeholder(dtype=tf.float32,
                                shape=[None, None],
                                name='x_capi')

        y_true = tf.placeholder(dtype=tf.int32,
                                shape=[None, None],
                                name='y_tag')
        mask = tf.placeholder(dtype=tf.float32,
                              shape=[None, None],
                              name='mask')
        sequence_lengths = tf.reduce_sum(mask, axis=1)

        # Concat features to embeddings
        emb = tf.concat(
            [x_word, tf.expand_dims(x_capi, 2), x_pos, x_ner], axis=2)

        # The network
        units = emb
        for n, n_h in enumerate(n_hidden):
            with tf.variable_scope('RNN_' + str(n)):
                units, _ = cudnn_bi_lstm(units, n_h,
                                         tf.to_int32(sequence_lengths))

        # Classifier
        with tf.variable_scope('Classifier'):
            units = tf.layers.dense(units,
                                    n_hidden[-1],
                                    kernel_initializer=xavier_initializer())
            logits = tf.layers.dense(units,
                                     n_tags,
                                     kernel_initializer=xavier_initializer())

        # CRF
        _, trainsition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_true, sequence_lengths)

        # Initialize session
        if sess is None:
            sess = tf.Session()

        self._ner_tagger = SennaNERTagger('download/senna/')
        self._pos_tagger = SennaChunkTagger('download/senna/')
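        # Note: the "POS" feature tagger above is a SennaChunkTagger, so these
        # features are BIO chunk tags rather than part-of-speech tags.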

        self._x_w = x_word
        self._x_c = x_char
        self._x_capi = x_capi
        self.x_pos = x_pos
        self.x_ner = x_ner
        self._y_true = y_true
        self._mask = mask
        self._sequence_lengths = sequence_lengths
        self._token_embeddings_dim = token_embeddings_dim

        self._pos_dict = pos_vocab
        self._ner_dict = ner_vocab
        self._tag_dict = tag_vocab

        self._logits = logits
        self._trainsition_params = trainsition_params

        self._sess = sess
        sess.run(tf.global_variables_initializer())
        self._embedder = embedder
Example #10
import sys
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.tag import SennaTagger,SennaChunkTagger

f=open(sys.argv[1])
lines=sent_tokenize(f.read())
words=word_tokenize(lines[0])
chunk_tagger=SennaChunkTagger('/usr/share/senna-v2.0')
ch=chunk_tagger.tag(words)
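# Start a new line for each B- tag (chunk type first); all other tokens are appended to the current line.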

for words in ch:    
    if "B-" in words[1]:
        B=("\n"+words[1].strip("B-")+" "+words[0])
        print(B,end="")
    else:
        print(" "+words[0],end="")
 
Example #11
# -*-coding: utf-8 -*-
'''
Created on 17 Mar 2016

@author: BurakKerim
'''

from nltk.tokenize import WordPunctTokenizer
from nltk.tag import SennaChunkTagger

senna_path = '/media/burak/Data/Workspace/Library/senna'

chunker = SennaChunkTagger(senna_path)
tokenizer = WordPunctTokenizer()


def tokenize(sent):
    return tokenizer.tokenize(sent.lower())


def chunk(sent):
    return chunker.tag(tokenize(sent))


sentence = 'This sentence is a test sentence for test in a test environment.'

print(tokenize(sentence))
print(chunk(sentence))
Example #12
import sys
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import SennaChunkTagger

Ctagger = SennaChunkTagger('/usr/share/senna-v2.0')
argv = sys.argv

sent_tokenized = sent_tokenize(open(argv[1]).read())
word_tokenized = word_tokenize(sent_tokenized[0])
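# "\11" below is an octal escape for the tab character, so each line prints the chunk tag and the token separated by a tab.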

for a,b in Ctagger.tag(word_tokenized):
    print(b,"\11",a)
Example #13
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaChunkTagger

tagger = SennaChunkTagger('/usr/share/senna-v2.0')

argv = sys.argv
f = open(argv[1], 'r').read()
sentences = sent_tokenize(f)

word = word_tokenize(sentences[0])

chunk_list = []
chunk = tagger.tag(word)
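# Splitting a BIO tag on '-' gives ['B', 'NP'] for 'B-NP', while 'O' stays as ['O'].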

for i in range(len(chunk)):
    chunk_list.append(chunk[i][1].split('-'))

for j in range(len(chunk_list)):
    if(chunk_list[j][0] == 'O'):
        if(chunk_list[j - 1][0] != 'O'):
            print(chunk_list[j - 1][1])
            
        else:
            pass
        
    elif(j == 0 or chunk_list[j - 1][0] == 'O'):
            print(chunk[j][0], end = ' ')
    elif( j != 0):
            if(chunk_list[j][1] == chunk_list[j - 1][1]):
                print(chunk[j][0], end = ' ')
Example #14
class NerNetwork:
    def __init__(self, embedder, tag_vocab, ner_vocab, pos_vocab, sess=None):

        # check gpu
        if not check_gpu_existence():
            raise RuntimeError('Ontonotes NER model requires GPU with cuDNN!')

        n_hidden = (256, 256, 256)
        token_embeddings_dim = 100
        n_tags = len(tag_vocab)

        # Create placeholders
        x_word = tf.placeholder(dtype=tf.float32,
                                shape=[None, None, token_embeddings_dim],
                                name='x_word')
        x_char = tf.placeholder(dtype=tf.int32,
                                shape=[None, None, None],
                                name='x_char')

        # Features
        x_pos = tf.placeholder(dtype=tf.float32,
                               shape=[None, None, len(pos_vocab)],
                               name='x_pos')  # Senna
        x_ner = tf.placeholder(dtype=tf.float32,
                               shape=[None, None, len(ner_vocab)],
                               name='x_ner')  # Senna
        x_capi = tf.placeholder(dtype=tf.float32,
                                shape=[None, None],
                                name='x_capi')

        y_true = tf.placeholder(dtype=tf.int32,
                                shape=[None, None],
                                name='y_tag')
        mask = tf.placeholder(dtype=tf.float32,
                              shape=[None, None],
                              name='mask')
        sequence_lengths = tf.reduce_sum(mask, axis=1)

        # Concat features to embeddings
        emb = tf.concat(
            [x_word, tf.expand_dims(x_capi, 2), x_pos, x_ner], axis=2)

        # The network
        units = emb
        for n, n_h in enumerate(n_hidden):
            with tf.variable_scope('RNN_' + str(n)):
                units, _ = cudnn_bi_lstm(units, n_h,
                                         tf.to_int32(sequence_lengths))

        # Classifier
        with tf.variable_scope('Classifier'):
            units = tf.layers.dense(units,
                                    n_hidden[-1],
                                    kernel_initializer=xavier_initializer())
            logits = tf.layers.dense(units,
                                     n_tags,
                                     kernel_initializer=xavier_initializer())

        # CRF
        _, trainsition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_true, sequence_lengths)

        # Initialize session
        if sess is None:
            sess = tf.Session()

        self._ner_tagger = SennaNERTagger('download/senna/')
        self._pos_tagger = SennaChunkTagger('download/senna/')

        self._x_w = x_word
        self._x_c = x_char
        self._x_capi = x_capi
        self.x_pos = x_pos
        self.x_ner = x_ner
        self._y_true = y_true
        self._mask = mask
        self._sequence_lengths = sequence_lengths
        self._token_embeddings_dim = token_embeddings_dim

        self._pos_dict = pos_vocab
        self._ner_dict = ner_vocab
        self._tag_dict = tag_vocab

        self._logits = logits
        self._trainsition_params = trainsition_params

        self._sess = sess
        sess.run(tf.global_variables_initializer())
        self._embedder = embedder

    def load(self, model_file_path):
        saver = tf.train.Saver(tf.trainable_variables())
        saver.restore(self._sess, model_file_path)

    @staticmethod
    def to_one_hot(x, n):
        b = np.zeros([len(x), n], dtype=np.float32)
        for q, tok in enumerate(x):
            b[q, tok] = 1
        return b

    def tokens_batch_to_numpy_batch(self, batch_x):
        """ Convert a batch of tokens to numpy arrays of features"""
        x = dict()
        batch_size = len(batch_x)
        max_utt_len = max([len(utt) for utt in batch_x])

        # Embeddings
        x['emb'] = self._embedder(batch_x)

        # Capitalization
        x['capitalization'] = np.zeros([batch_size, max_utt_len],
                                       dtype=np.float32)
        for n, utt in enumerate(batch_x):
            x['capitalization'][n, :len(utt)] = [
                tok[0].isupper() for tok in utt
            ]

        # POS
        n_pos = len(self._pos_dict)
        x['pos'] = np.zeros([batch_size, max_utt_len, n_pos])
        for n, utt in enumerate(batch_x):
            token_tag_pairs = self._pos_tagger.tag(utt)
            pos_tags = list(zip(*token_tag_pairs))[1]
            pos = np.array([self._pos_dict[p] for p in pos_tags])
            pos = self.to_one_hot(pos, n_pos)
            x['pos'][n, :len(pos)] = pos

        # NER
        n_ner = len(self._ner_dict)
        x['ner'] = np.zeros([batch_size, max_utt_len, n_ner])
        for n, utt in enumerate(batch_x):
            token_tag_pairs = self._ner_tagger.tag(utt)
            ner_tags = list(zip(*token_tag_pairs))[1]
            ner = np.array([self._ner_dict[p] for p in ner_tags])
            ner = self.to_one_hot(ner, n_ner)
            x['ner'][n, :len(ner)] = ner

        # Mask for paddings
        x['mask'] = np.zeros([batch_size, max_utt_len], dtype=np.float32)
        for n in range(batch_size):
            x['mask'][n, :len(batch_x[n])] = 1

        return x

    def train_on_batch(self, x_word, x_char, y_tag):
        raise NotImplementedError

    def predict(self, x):
        feed_dict = self._fill_feed_dict(x)
        y_pred = []
        logits, trans_params, sequence_lengths = self._sess.run(
            [self._logits, self._trainsition_params, self._sequence_lengths],
            feed_dict=feed_dict)

        # iterate over the sentences because no batching in viterbi_decode
        for logit, sequence_length in zip(logits, sequence_lengths):
            logit = logit[:int(sequence_length)]  # keep only the valid steps
            viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
                logit, trans_params)
            y_pred += [viterbi_seq]

        pred = []
        batch_size = len(x['emb'])
        for n in range(batch_size):
            pred.append([self._tag_dict[tag] for tag in y_pred[n]])
        return pred

    def predict_on_batch(self, tokens_batch):
        batch_x = self.tokens_batch_to_numpy_batch(tokens_batch)
        # Prediction indices
        predictions_batch = self.predict(batch_x)
        predictions_batch_no_pad = list()
        for n, predicted_tags in enumerate(predictions_batch):
            predictions_batch_no_pad.append(
                predicted_tags[:len(tokens_batch[n])])
        return predictions_batch_no_pad

    def _fill_feed_dict(self, x):

        feed_dict = dict()
        feed_dict[self._x_w] = x['emb']
        feed_dict[self._mask] = x['mask']

        feed_dict[self.x_pos] = x['pos']
        feed_dict[self.x_ner] = x['ner']

        feed_dict[self._x_capi] = x['capitalization']
        return feed_dict
Example #15
# define variables
cnt = 0  # counter
flist = []  # file list
linked_file = ""  # linked file
ifile = ""  # input file data
splited_file = []  # split file
taged_file = []  # tagged file
f = ""  # filename

import os.path
import nltk

# import the SENNA chunk tagger
from nltk.tag import SennaChunkTagger

chktagger = SennaChunkTagger('/usr/share/senna-v3.0')

#loop
while True:
    #import data
    cnt += 1
    ifile = input("please inputfile" + str(cnt) +
                  "(e to end input / q to quit) : ")

    #escape from loop
    if ifile == "e":
        break

    elif ifile == "q":
        quit()
Example #16
class CRF_Extractor:
    '''
    extract features for the CRF model
    each line is a feature vector for a token
    '''
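    # Assumed context, not shown in this snippet: SennaTagger/SennaChunkTagger
    # (nltk.tag), defaultdict (collections), a stemmer object exposing stem_tokens(),
    # and module-level global_params, prompt_words and stopwords.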
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color']
        
        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)
        
        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)
        
        self.sentences = []
        
        self.porter = PorterStemmer()
        
        self.token_dict = None
        self.bins = 50
    
    def add_sentence(self, sentence):
        self.sentences.append(sentence)
    
    def get_token_tf(self):
        self.token_dict = defaultdict(float)
        
        for tokens, _, _ in self.sentences:
            for token in self.porter.stem_tokens(tokens):
                self.token_dict[token] += 1.0
        
        self.rank_dict = defaultdict(int)
        rank_tokens = sorted(self.token_dict, key=self.token_dict.get, reverse=True)
        
        self.rank_dict = defaultdict(int)
        for i, token in enumerate(rank_tokens):
            self.rank_dict[token] = int(i*10/len(rank_tokens))
        
        for t, v in self.token_dict.items(): #normalized by the number of sentences
            x = v/len(self.sentences)
            if x > 1.0: x = 1.0
            
            self.token_dict[t] = x
        
    def get_feature_names(self):
        return '_'.join(self.features)
    
    def get_i_j(self, body, i, j):
        '''
        return the value of the crf template feature u[i, j]
        input:
            body: [][], two-dimensional array representing the crf features for a sentence
            i: int, the index of i
            j: int, the index of j
        '''
        n = len(body)
        if i < 0:
            v = '_x%d'%(i)
        elif i >= n:
            v = '_x+%d'%(i-n+1)
        else:
            v = body[i][j]
        return v
    
    def extract_U_i_j(self, data_body, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        input:
            data_body: [][], two-dimensional array representing the crf data for a sentence
            feature_body: [][], two-dimensional array holding the resulting feature data for a sentence
            i: int, the index of i
            j: int, the index of j
            tag: the prefix of the feature name
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j)))
    
    def extract_U_i_j_m_n(self, data_body, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n)))
    
    def extract_U_i_j_m_n_x_y(self, data_body, feature_body, i, j, m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x,y] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y)))
    
    def extract_word_U_i_j(self, data_body, index, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j)))
    
    def extract_word_U_i_j_m_n(self, data_body, index, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n)))
    
    def extract_word_U_i_j_m_n_x_y(self, data_body, index, feature_body, i, j, m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x, y] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y)))
            
    def extract_bigram(self, body):
        '''
        extract the bigram feature for the crf template
        '''
        for row in body:
            row.append('b')
    
    def extract_crf_features(self, tokens, tags, prompt, colors=None):
        '''
        Extract the character features, each token a line
        return: [][], two-dimensional array representing the feature data of the sentence
        '''
    
        body = []

        words = tokens
        N = len(tokens)
        
        #first row: the word token
        for word in words:
            row = []
            row.append(word)
            body.append(row)
        
        if 'pos' in self.features:
            pos_tags = self.pos_tagger.tag(tokens)
            
            for i, (_, p_tag) in enumerate(pos_tags):
                body[i].append(p_tag)
        
        if 'chunk' in self.features:
            chunk_tags = self.chunk_tagger.tag(tokens)
            
            for i, (_, c_tag) in enumerate(chunk_tags):
                body[i].append(c_tag)
        
        if 'promptword' in self.features:
            for i, token in enumerate(tokens):
                if token in prompt_words[prompt]:
                    body[i].append('Y')
                else:
                    body[i].append('N')
        
        if 'stopword' in self.features:
            for i, token in enumerate(tokens):
                if token in stopwords:
                    body[i].append('Y')
                else:
                    body[i].append('N')
        
        if 'tf' in self.features:
            if self.token_dict == None:
                self.get_token_tf()
            
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert(token in self.token_dict)
                
                x = int(self.token_dict[token]*self.bins)
                body[i].append(str(x))
        
        if 'rank' in self.features:
            if self.rank_dict == None:
                self.get_token_tf()
            
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert(token in self.rank_dict)
                
                x = self.rank_dict[token]
                body[i].append(str(x))        
        
        if 'color' in self.features and colors != None:
            for color in colors:
                for i, tag in enumerate(tags):
                    body[i].append(str(color[i]))
        
        #last row:
        tags = [tag for tag in tags]
        
        for i, tag in enumerate(tags):
            body[i].append(tag)
        
        return body
Example #17
import sys
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.tag import SennaTagger,SennaChunkTagger

ctagger = SennaChunkTagger('/usr/share/senna-v2.0')
# define a dictionary-style object

# read the file given by sys.argv[1]
# split the whole text into a list of sentences and keep only the first one (sent_tokenize[0])

first_sentence = sent_tokenize(open(sys.argv[1],'r').read())[0]
#first_sentence = word_tokenize(sent_tokenize(open(sys.argv[1],'r').read())[0])

# turn the first sentence into a list of tagged words
taglist = ctagger.tag(first_sentence.split())

buff1=""

# loop over the words and tags in taglist
for word,tag in taglist:
    if "B-" in tag:
        if buff1 !="":
            # print the words sharing this tag together with the tag from its third character on
            print(buff1, buff2[2:])
        buff1=word
        buff2=tag
    elif "I-" in tag:
        buff1 = buff1 + " " + word
Example #18
import sys
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import SennaChunkTagger

Ctagger = SennaChunkTagger('/usr/share/senna-v2.0')

# split the loaded text into sentences
sent_tokenized = sent_tokenize(open(sys.argv[1]).read())

# word-tokenize only the first sentence
word_tokenized = word_tokenize(sent_tokenized[0])

nlFlg = False  # newline flag
for word, tag in Ctagger.tag(word_tokenized):
    if "B-" in tag:
        if nlFlg == True:
            print()
        else:
            nlFlg = True

        # print the chunk type and the first word of the chunk
        print(str(tag).lstrip("B-"),"\11",str(word),end=" ")

    elif "I-" in tag:
        print(str(word),end=" ")
        nlFlg = True
Example #19
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaChunkTagger

ctagger = SennaChunkTagger('/usr/share/senna-v2.0')

fr = open(sys.argv[1]).read()
sent = sent_tokenize(fr)

aword = word_tokenize(sent[0])

print(ctagger.tag(aword))
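# Print one chunk per line: the chunk type (B- prefix removed) followed by the words in the chunk.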

for word, tag in ctagger.tag(aword):
    if "B-" in tag:
        print()
        print(str(tag).replace("B-",""), word, end="")

    elif "I-" in tag:
        print("", word, end="")
Example #20
class CRF_Extractor:
    '''
    extract features for the CRF model
    each line is a feature vector for a token
    '''
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = [
            'pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color'
        ]

        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)

        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

        self.sentences = []

        self.porter = PorterStemmer()

        self.token_dict = None
        self.bins = 50

    def add_sentence(self, sentence):
        self.sentences.append(sentence)

    def get_token_tf(self):
        self.token_dict = defaultdict(float)

        for tokens, _, _ in self.sentences:
            for token in self.porter.stem_tokens(tokens):
                self.token_dict[token] += 1.0

        self.rank_dict = defaultdict(int)
        rank_tokens = sorted(self.token_dict,
                             key=self.token_dict.get,
                             reverse=True)

        self.rank_dict = defaultdict(int)
        for i, token in enumerate(rank_tokens):
            self.rank_dict[token] = int(i * 10 / len(rank_tokens))

        for t, v in self.token_dict.items(
        ):  #normalized by the number of sentences
            x = v / len(self.sentences)
            if x > 1.0: x = 1.0

            self.token_dict[t] = x

    def get_feature_names(self):
        return '_'.join(self.features)

    def get_i_j(self, body, i, j):
        '''
        return the value of the crf template feature u[i, j]
        input:
            body: [][], two-dimensional array representing the crf features for a sentence
            i: int, the index of i
            j: int, the index of j
        '''
        n = len(body)
        if i < 0:
            v = '_x%d' % (i)
        elif i >= n:
            v = '_x+%d' % (i - n + 1)
        else:
            v = body[i][j]
        return v

    def extract_U_i_j(self, data_body, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        input:
            data_body: [][], two-dimensional array representing the crf data for a sentence
            feature_body: [][], two-dimensional array holding the resulting feature data for a sentence
            i: int, the index of i
            j: int, the index of j
            tag: the prefix of the feature name
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s' % (tag, self.get_i_j(data_body, k + i, j)))

    def extract_U_i_j_m_n(self, data_body, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(data_body, k + m, n)))

    def extract_U_i_j_m_n_x_y(self, data_body, feature_body, i, j, m, n, x, y,
                              tag):
        '''
        extract the U[i, j]/U[m, n]/U[x,y] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(
                    data_body, k + m, n), self.get_i_j(data_body, k + x, y)))

    def extract_word_U_i_j(self, data_body, index, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s' % (tag, self.get_i_j(data_body, k + i, j)))

    def extract_word_U_i_j_m_n(self, data_body, index, feature_body, i, j, m,
                               n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(data_body, k + m, n)))

    def extract_word_U_i_j_m_n_x_y(self, data_body, index, feature_body, i, j,
                                   m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x, y] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(
                    data_body, k + m, n), self.get_i_j(data_body, k + x, y)))

    def extract_bigram(self, body):
        '''
        extract the bigram feature for the crf template
        '''
        for row in body:
            row.append('b')

    def extract_crf_features(self, tokens, tags, prompt, colors=None):
        '''
        Extract the character features, each token a line
        return: [][], two-dimensional array representing the feature data of the sentence
        '''

        body = []

        words = tokens
        N = len(tokens)

        #first row: the word token
        for word in words:
            row = []
            row.append(word)
            body.append(row)

        if 'pos' in self.features:
            pos_tags = self.pos_tagger.tag(tokens)

            for i, (_, p_tag) in enumerate(pos_tags):
                body[i].append(p_tag)

        if 'chunk' in self.features:
            chunk_tags = self.chunk_tagger.tag(tokens)

            for i, (_, c_tag) in enumerate(chunk_tags):
                body[i].append(c_tag)

        if 'promptword' in self.features:
            for i, token in enumerate(tokens):
                if token in prompt_words[prompt]:
                    body[i].append('Y')
                else:
                    body[i].append('N')

        if 'stopword' in self.features:
            for i, token in enumerate(tokens):
                if token in stopwords:
                    body[i].append('Y')
                else:
                    body[i].append('N')

        if 'tf' in self.features:
            if self.token_dict == None:
                self.get_token_tf()

            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert (token in self.token_dict)

                x = int(self.token_dict[token] * self.bins)
                body[i].append(str(x))

        if 'rank' in self.features:
            if self.rank_dict == None:
                self.get_token_tf()

            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert (token in self.rank_dict)

                x = self.rank_dict[token]
                body[i].append(str(x))

        if 'color' in self.features and colors != None:
            for color in colors:
                for i, tag in enumerate(tags):
                    body[i].append(str(color[i]))

        #last row:
        tags = [tag for tag in tags]

        for i, tag in enumerate(tags):
            body[i].append(tag)

        return body
Example #21
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaChunkTagger
chktagger = SennaChunkTagger('/usr/share/senna-v2.0')
argv = sys.argv
text = open(argv[1] ,'r').read()
sentence = sent_tokenize(text)
count = 0
word2 = []
for part in sentence:
    count += 1
    if count < 2:
        word = word_tokenize(part)
word1 = chktagger.tag(word)
for i in range(len(word1)):
    if "-" in word1[i][1]:
        word2 = word2 + [[word1[i][0],word1[i][1].split("-")]]
    else:
        word2 = word2 + [[word1[i][0],[word1[i][1],word1[i][1]]]]
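# Each word2 entry is [token, [BIO prefix, chunk type]]; 'O' tags are duplicated so that
# index [1][1] always holds the chunk type.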
for i in range(len(word2)):
    if i == 0:
        print(word2[i][0]+" ", end="")
    else:
        if (word2[i][1][1] == word2[i-1][1][1] and word2[i][1][1] != "O"):
            print(word2[i][0]+" ", end="")
        elif (word2[i][1][1] != word2[i-1][1][1]):
            if(word2[i][1][1] == "O"):
                print(" "+word2[i-1][1][1])
            elif(word2[i-1][1][1] == "O"):
                print(word2[i][0]+" ", end="")
            else:
Example #22
#-*- coding: utf-8 -*-

import sys
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaChunkTagger

argvs = sys.argv
argc = len(argvs)

# print a usage message if the arguments are invalid
if (argc != 2):
        print('Usage: # python %s filename' % argvs[0])
        quit()
# prepare the tagger
tagger = SennaChunkTagger('/usr/share/senna-v2.0')

# sentence segmentation
openedFile = open(argvs[1]).read()
sent_tokenize_list = sent_tokenize(openedFile)


# word-tokenize the first sentence
word_tokenize_list = word_tokenize(sent_tokenize_list[0])

# tagging

tag_list = []
tagged_sent = tagger.tag(word_tokenize_list)

for i in range(len(tagged_sent)):
Example #23
import os
import nltk
from nltk.tag import SennaChunkTagger

#print os.environ['SENNA']

chktagger = SennaChunkTagger(os.environ['SENNA'])


sentences = [
  'Will String.trim() remove all spaces on these sides or just one space on each?',
  'If no such object exists, the map should be "wrapped" using the Collections.synchronizedMap method.'
]
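# Write one token per line: a placeholder 'O' label, the token, its POS tag, and its SENNA chunk tag.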

with open('train.txt', 'w') as f:
  for s in sentences:
    tokens = nltk.word_tokenize(s)
    pos_tag = nltk.pos_tag(tokens)
    chunk_tag = chktagger.tag(tokens)
    #print chunk_tag

    for index, token in enumerate(tokens):
      f.write('O %s %s %s \n' % ( token, pos_tag[index][1], chunk_tag[index][1] ))

    f.write('\n')

  f.write('\n')