def test_senna_tagger(self):
    tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
    result = tagger.tag('What is the airspeed of an unladen swallow ?'.split())
    expected = [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),
                ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'),
                ('?', '.')]
    self.assertEqual(result, expected)
class Senna():
    '''SennaTagger'''

    def __init__(self, path='senna', **kwargs):
        self.__dict__.update(kwargs)
        if not os.path.isabs(path):
            current_dir = os.path.dirname(os.path.abspath(__file__))
            path = os.path.join(current_dir, path)
        paths = (
            path,
            os.path.join(sys.exec_prefix, r'lib\site-packages', 'senna'),
            os.path.join(MODULEDIR, 'bin', 'senna')
        )
        for path in paths:
            if os.path.exists(path):
                break
        else:
            raise FileNotFoundError(paths)
        self.tagger = SennaTagger(path, **kwargs)

    def __call__(self, tokens):
        return self.tagger.tag(tokens)
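# Not from the original module: a minimal usage sketch for the wrapper above.
# It assumes a SENNA installation exists at one of the candidate paths checked
# in __init__ (and that MODULEDIR is defined by the surrounding module).
senna = Senna()
print(senna('What is the airspeed of an unladen swallow ?'.split()))
# expected shape: [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ...]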
def __init__(self):
    '''
    if phrase_dict_json != None: extract the phrase features
    if subtype_flag = True, extract the features by sub parse_type
    if bioe_flag = True, use the BIOE tags
    '''
    self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color']

    if 'pos' in self.features:
        self.pos_tagger = SennaTagger(global_params.sennadir)

    if 'chunk' in self.features:
        self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

    self.sentences = []

    self.porter = PorterStemmer()

    self.token_dict = None
    self.bins = 50
class Tagger:
    """Tag words' part of speech in sentences."""

    def __init__(self):
        self.tagger = SennaTagger('/app/util/senna')

    def tag(self, tokens_in_sentence):
        """Tag tokens in sentences with their part of speech.

        :param tokens_in_sentence: list of tokens, grouped by sentence.
        :return: iterator of lists with words.
        """
        return ((Word(t, self._parse_POS(POS)) for t, POS in s)
                for s in self.tagger.tag_sents(tokens_in_sentence))

    def _parse_POS(self, tag):
        if tag.startswith('J'):
            return PartOfSpeach.ADJECTIVE
        elif tag.startswith('V'):
            return PartOfSpeach.VERB
        elif tag.startswith('N'):
            return PartOfSpeach.NOUN
        elif tag.startswith('R'):
            return PartOfSpeach.ADVERB
        return PartOfSpeach.OTHER
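# Not from the original module: a minimal usage sketch for the Tagger class above.
# It assumes a SENNA installation at /app/util/senna and the project's own
# Word and PartOfSpeach types (imported elsewhere in that module).
tagger = Tagger()
sentences = [['The', 'cat', 'sleeps', '.'], ['Dogs', 'bark', 'loudly', '.']]
for tagged_sentence in tagger.tag(sentences):
    for word in tagged_sentence:
        print(word)  # each item is a Word(token, PartOfSpeach.<category>)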
                EventStructures['Who'] = text
            elif 'A2' == arg:
                EventStructures['Whom'] = text
                text = labels[i][1][0]
                Args.append(text)
            else:
                text += ' ' + labels[i][1][0]
    print(EventStructures)
    return Args


srltagger = SennaSRLTagger(path)
nertagger = SennaNERTagger(path)
chktagger = SennaChunkTagger(path)
tagger = SennaTagger(path)
#w = s.tag("Are you studying here?".split())
#w = s.tag("""A general interface to the SENNA pipeline that supports any of the operations specified in SUPPORTED OPERATIONS..""".split())
#print(tagger.tag(sents))
#print('\n___________________\n')
#print(chktagger.tag(sents))
#print('\n___________________\n')
#print(nertagger.tag(sents))
#print('\n___________________\n')
#print(srltagger.tag(sents))
#print('\n___________________\n')
#text = sent
NE_Tagger(text)
#print('\n'.join(str(e) for e in NE_Tagger(sents)))
#define var
cnt = 0            #counter
flist = []         #file list
linked_file = ""   #linked file
ifile = ""         #input file data
splited_file = []  #splited file
taged_file = []    #taged file
f = ""             #filename

import os.path
import nltk
#import Senna Tagger
from nltk.tag import SennaTagger

tagger = SennaTagger('/usr/share/senna-v3.0')

#loop
while True:
    #import data
    cnt += 1
    ifile = input("please inputfile" + str(cnt) + "(e to end input / q to quit) : ")
    #escape from loop
    if ifile == "e":
        break
    elif ifile == "q":
        quit()
class QualityPrediction:
    def __init__(self, config):
        '''
        learning_algorithm -- the algorithm to train with (default "SVM")
        '''
        self.training_file = config.get('model', 'train')
        self.learning_algorithm = config.get('model', 'classify')
        self.features = config.get('model', 'features').split(',')
        #print self.features
        self.course = config.get('model', 'course')
        self.test_file = '../data/' + self.course + '.json'

        self._model = None

        if 'pos' in self.features:
            self.tagger = SennaTagger(config.get('model', 'senna'))

        if 'content' in self.features:
            self.contentwords = [line.strip().lower() for line in
                                 open(config.get('model', 'content')).readlines()]

        if 'organization' in self.features:
            self.orgnizationwords = [line.strip().lower() for line in
                                     open(config.get('model', "organization")).readlines()]

        featuresets = self._get_training_data()
        self._train_classifier_model(featuresets)

    def evaluate(self):
        test_featureset = self._get_featuresets(self.test_file)
        labels = [int(x[1]) for x in test_featureset]
        featureset = [x[0] for x in test_featureset]
        predicts = [int(x) for x in self._model.classify_many(featureset)]

        metric = Metric()
        return metric.accuracy(labels, predicts), metric.kappa(labels, predicts), metric.QWkappa(labels, predicts)

    def get_features(self, text, cid, lecture):
        features = {}

        #unigram
        tokens = nltk.word_tokenize(text)

        if 'WC' in self.features:
            features['WC'] = len(tokens)

        if 'unigram' in self.features:
            for token in tokens:
                features['U0_' + token.lower()] = 1

        if 'pos' in self.features:
            tags = self.tagger.tag(tokens)
            for _, tag in tags:
                features['P0_' + tag] = 1

        if 'content' in self.features:
            hasContentWord = 0
            for word in tokens:
                if word.lower() in self.contentwords:
                    hasContentWord = 1
                    break
            features['C0_'] = hasContentWord

        if 'organization' in self.features:
            OrgAssign = 0
            for word in tokens:
                if word.lower() in self.orgnizationwords:
                    OrgAssign = 1
                    break
            features['O0_'] = OrgAssign

        return features

    def get_model(self):
        """An accessor method for the model."""
        return self._model

    def _get_featuresets(self, input):
        featuresets = []

        MPLectures = file_util.LoadDictJson(input)

        for week, MPs in enumerate(MPLectures):
            if MPs == []:
                continue
            for k, (MP, score) in enumerate(MPs):
                features = self.get_features(MP, week, 'Engineer')
                featuresets.append((features, score))

        return featuresets

    def _get_training_data(self):
        """Builds and returns positive and negative feature sets
        for the algorithm
        """
        featuresets = self._get_featuresets(self.training_file)
        return featuresets

    def _train_classifier_model(self, featuresets):
        """This changes the algorithm that nltk uses to train the model.

        Arguments:
        featuresets -- array of features generated for training
        """
        model = None
        if (self.learning_algorithm == "NB"):
            model = nltk.NaiveBayesClassifier.train(featuresets)
        elif (self.learning_algorithm == "MaxEnt"):
            model = nltk.MaxentClassifier.train(featuresets, "MEGAM", max_iter=15)
        elif (self.learning_algorithm == "DecisionTree"):
            model = nltk.DecisionTreeClassifier.train(featuresets, 0.05)
        elif (self.learning_algorithm == 'SVM'):
            model = SklearnClassifier(SVC(kernel='linear')).train(featuresets)
        self._model = model

    def predict(self, text, cid=None, lecture=None):
        features = self.get_features(text, cid, lecture)
        return self._model.classify(features)
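# Not from the original project: a sketch of the INI-style config that the
# constructor above appears to expect, since every option is read with
# config.get('model', ...). All paths and values below are placeholders, and
# building the object trains a model, so the referenced files must exist.
import configparser

config = configparser.ConfigParser()
config.read_string("""
[model]
train = ../data/train.json
classify = SVM
features = WC,unigram,pos,content,organization
course = Engineer
senna = /path/to/senna
content = ../data/contentwords.txt
organization = ../data/organizationwords.txt
""")

predictor = QualityPrediction(config)
print(predictor.predict('Today we learned how gradient descent works.'))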
import sys
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import SennaTagger

argv = sys.argv
sent_tokenized = sent_tokenize(open(argv[1]).read())
word_tokenized = word_tokenize(sent_tokenized[0])
tagger = SennaTagger('/usr/share/senna-v2.0')
for a, b in tagger.tag(word_tokenized):
    print(b, "\11", a)
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

tagger = SennaTagger('/usr/share/senna-v2.0')
argv = sys.argv
text = open(argv[1], 'r').read()
sentence = sent_tokenize(text)
count = 0
for part in sentence:
    count += 1
    if count < 2:
        word = tagger.tag(part.split())
        for i in word:
            print(i)
def filter_task(f, whitelist_dict, foutpath, key_name):
    # pretrain = HunposTagger('hunpos.model', 'hunpos-1.0-linux/hunpos-tag')
    pretrain = SennaTagger('senna')
    """
    Uses: namecheck() to check if word that has been tagged as name by either nltk or spacy.
    namecheck() first searches nameset which is generated by checking words at the sentence level and tagging names.
    If word is not in nameset, namecheck() uses spacy.nlp() to check if word is likely to be a name at the word level.
    """
    with open(f, encoding='utf-8', errors='ignore') as fin:
        # define initial variables
        head, tail = os.path.split(f)
        #f_name = re.findall(r'[\w\d]+', tail)[0]  # get the file number
        print(tail)
        start_time_single = time.time()
        total_records = 1
        phi_containing_records = 0
        safe = True
        screened_words = []
        name_set = set()
        phi_reduced = ''
        '''
        address_indictor = ['street', 'avenue', 'road', 'boulevard', 'drive',
                            'trail', 'way', 'lane', 'ave', 'blvd', 'st', 'rd',
                            'trl', 'wy', 'ln', 'court', 'ct', 'place', 'plc',
                            'terrace', 'ter']
        '''
        address_indictor = [
            'street', 'avenue', 'road', 'boulevard', 'drive', 'trail', 'way',
            'lane', 'ave', 'blvd', 'st', 'rd', 'trl', 'wy', 'ln', 'court',
            'ct', 'place', 'plc', 'terrace', 'ter', 'highway', 'freeway',
            'autoroute', 'autobahn', 'expressway', 'autostrasse', 'autostrada',
            'byway', 'auto-estrada', 'motorway', 'avenue', 'boulevard', 'road',
            'street', 'alley', 'bay', 'drive', 'gardens', 'gate', 'grove',
            'heights', 'highlands', 'lane', 'mews', 'pathway', 'terrace',
            'trail', 'vale', 'view', 'walk', 'way', 'close', 'court', 'place',
            'cove', 'circle', 'crescent', 'square', 'loop', 'hill', 'causeway',
            'canyon', 'parkway', 'esplanade', 'approach', 'parade', 'park',
            'plaza', 'promenade', 'quay', 'bypass'
        ]

        note = fin.read()
        note = re.sub(r'=', ' = ', note)
        # Begin Step 1: salutation check
        re_list = pattern_salutation.findall(note)
        for i in re_list:
            name_set = name_set | set(i[1].split(' '))
        # note_length = len(word_tokenize(note))
        # Begin Step 2: split document into sentences
        note = sent_tokenize(note)
        for sent in note:
            # Begin Step 3: pattern checking
            # postal code check
            # print(sent)
            if pattern_postal.findall(sent) != []:
                safe = False
                for item in pattern_postal.findall(sent):
                    screened_words.append(item[0])
                sent = str(pattern_postal.sub('**PHIPostal**', sent))
            if pattern_devid.findall(sent) != []:
                safe = False
                for item in pattern_devid.findall(sent):
                    if (re.search(r'\d', item) is not None and
                            re.search(r'[A-Z]', item) is not None):
                        screened_words.append(item)
                        sent = sent.replace(item, '**PHI**')
            # number check
            if pattern_number.findall(sent) != []:
                safe = False
                for item in pattern_number.findall(sent):
                    # print(item)
                    #if pattern_date.match(item[0]) is None:
                    sent = sent.replace(item[0], '**PHI**')
                    screened_words.append(item[0])
                    #print(item[0])
                #sent = str(pattern_number.sub('**PHI**', sent))
            '''
            if pattern_date.findall(sent) != []:
                safe = False
                for item in pattern_date.findall(sent):
                    if '-' in item[0]:
                        if (len(set(re.findall(r'[^\w\-]',item[0]))) <= 1):
                            screened_words.append(item[0])
                            #print(item[0])
                            sent = sent.replace(item[0], '**PHIDate**')
                    else:
                        if len(set(re.findall(r'[^\w]',item[0]))) == 1:
                            screened_words.append(item[0])
                            #print(item[0])
                            sent = sent.replace(item[0], '**PHIDate**')
            '''
            data_list = []
            if pattern_date.findall(sent) != []:
                safe = False
                for item in pattern_date.findall(sent):
                    if '-' in item[0]:
                        if (len(set(re.findall(r'[^\w\-]', item[0]))) <= 1):
                            #screened_words.append(item[0])
                            #print(item[0])
                            data_list.append(item[0])
                            #sent = sent.replace(item[0], '**PHIDate**')
                    else:
                        if len(set(re.findall(r'[^\w]', item[0]))) == 1:
                            #screened_words.append(item[0])
                            #print(item[0])
                            data_list.append(item[0])
                            #sent = sent.replace(item[0], '**PHIDate**')
            data_list.sort(key=len, reverse=True)
            for item in data_list:
                sent = sent.replace(item, '**PHIDate**')
            #sent = str(pattern_date.sub('**PHI**', sent))
            #print(sent)
            if pattern_4digits.findall(sent) != []:
                safe = False
                for item in pattern_4digits.findall(sent):
                    screened_words.append(item)
                sent = str(pattern_4digits.sub('**PHI**', sent))
            # email check
            if pattern_email.findall(sent) != []:
                safe = False
                for item in pattern_email.findall(sent):
                    screened_words.append(item)
                sent = str(pattern_email.sub('**PHI**', sent))
            # url check
            if pattern_url.findall(sent) != []:
                safe = False
                for item in pattern_url.findall(sent):
                    #print(item[0])
                    if (re.search(r'[a-z]', item[0]) is not None and
                            '.' in item[0] and
                            re.search(r'[A-Z]', item[0]) is None and
                            len(item[0]) > 10):
                        print(item[0])
                        screened_words.append(item[0])
                        sent = sent.replace(item[0], '**PHI**')
                        #print(item[0])
                #sent = str(pattern_url.sub('**PHI**', sent))
            # dob check
            '''
            re_list = pattern_dob.findall(sent)
            i = 0
            while True:
                if i >= len(re_list):
                    break
                else:
                    text = ' '.join(re_list[i][0].split(' ')[-6:])
                    if re.findall(r'\b(birth|dob)\b', text, re.I) != []:
                        safe = False
                        sent = sent.replace(re_list[i][1], '**PHI**')
                        screened_words.append(re_list[i][1])
                    i += 2
            '''
            # Begin Step 4
            # substitute spaces for special characters
            sent = re.sub(r'[\/\-\:\~\_]', ' ', sent)
            # label all words for NER using the sentence level context.
            spcy_sent_output = nlp(sent)
            # split sentences into words
            sent = [word_tokenize(sent)]
            #print(sent)
            # Begin Step 5: context level pattern matching with regex
            for position in range(0, len(sent[0])):
                word = sent[0][position]
                # age check
                if word.isdigit() and int(word) > 90:
                    if position <= 2:  # check the words before age
                        word_previous = ' '.join(sent[0][:position])
                    else:
                        word_previous = ' '.join(sent[0][position - 2:position])
                    if position >= len(sent[0]) - 2:  # check the words after age
                        word_after = ' '.join(sent[0][position + 1:])
                    else:
                        word_after = ' '.join(sent[0][position + 1:position + 3])
                    age_string = str(word_previous) + str(word_after)
                    if pattern_age.findall(age_string) != []:
                        screened_words.append(sent[0][position])
                        sent[0][position] = '**PHI**'
                        safe = False
                # address check
                elif (position >= 1 and position < len(sent[0]) - 1 and
                      (word.lower() in address_indictor or
                       (word.lower() == 'dr' and sent[0][position + 1] != '.')) and
                      (word.istitle() or word.isupper())):
                    if sent[0][position - 1].istitle() or sent[0][position - 1].isupper():
                        screened_words.append(sent[0][position - 1])
                        sent[0][position - 1] = '**PHI**'
                        i = position - 1
                        # find the closest number, which should be the street number
                        while True:
                            if re.findall(r'^[\d-]+$', sent[0][i]) != []:
                                begin_position = i
                                break
                            elif i == 0 or position - i > 5:
                                begin_position = position
                                break
                            else:
                                i -= 1
                        i = position + 1
                        # block the info of city, state, apt number, etc.
                        while True:
                            if '**PHIPostal**' in sent[0][i]:
                                end_position = i
                                break
                            elif i == len(sent[0]) - 1:
                                end_position = position
                                break
                            else:
                                i += 1
                        if end_position <= position:
                            end_position = position
                        for i in range(begin_position, end_position):
                            #if sent[0][i] != '**PHIPostal**':
                            screened_words.append(sent[0][i])
                            sent[0][i] = '**PHI**'
                            safe = False
            # Begin Step 6: NLTK POS tagging
            sent_tag = nltk.pos_tag_sents(sent)
            #try:  # senna cannot handle long sentences.
                #sent_tag = [[]]
                #length_100 = len(sent[0])//100
                #for j in range(0, length_100+1):
                    #[sent_tag[0].append(j) for j in pretrain.tag(sent[0][100*j:100*(j+1)])]
                # hunpos needs to change the type from bytes to string
                #print(sent_tag[0])
                #sent_tag = [pretrain.tag(sent[0])]
                #for j in range(len(sent_tag[0])):
                    #sent_tag[0][j] = list(sent_tag[0][j])
                    #sent_tag[0][j][1] = sent_tag[0][j][1].decode('utf-8')
            #except:
                #print('POS error:', tail, sent[0])
                #sent_tag = nltk.pos_tag_sents(sent)
            # Begin Step 7: use both NLTK and spacy to check if the word is a name,
            # based on the sentence level NER label for the word.
            for ent in spcy_sent_output.ents:
                # spcy_sent_output contains a dict with each word in the sentence and its NLP labels.
                # spcy_sent_output.ents is a list of chunks of words (phrases) that spacy believes are named entities.
                # Each ent has 2 properties: text, which is the raw word, and label_, which is the NER category for the word.
                if ent.label_ == 'PERSON':
                    #print(ent.text)
                    # if the word is a person, recheck that spacy still thinks it is a person at the word level
                    spcy_chunk_output = nlp(ent.text)
                    if spcy_chunk_output.ents != () and spcy_chunk_output.ents[0].label_ == 'PERSON':
                        # now check what labels NLTK provides for the word
                        name_tag = word_tokenize(ent.text)
                        # senna & hunpos
                        #name_tag = pretrain.tag(name_tag)
                        # hunpos needs to change the type from bytes to string
                        #for j in range(len(name_tag)):
                            #name_tag[j] = list(name_tag[j])
                            #name_tag[j][1] = name_tag[j][1].decode('utf-8')
                        #chunked = ne_chunk(name_tag)
                        # default
                        name_tag = pos_tag_sents([name_tag])
                        chunked = ne_chunk(name_tag[0])
                        for i in chunked:
                            if type(i) == Tree:
                                # if ne_chunk thinks the chunk is a named entity, it creates a tree structure
                                # whose leaves are the words in the chunk (and their POS labels) and whose
                                # trunk is the single NER label for the chunk
                                if i.label() == 'PERSON':
                                    for token, pos in i.leaves():
                                        if pos == 'NNP':
                                            name_set.add(token)
                                else:
                                    for token, pos in i.leaves():
                                        spcy_upper_output = nlp(token.upper())
                                        if spcy_upper_output.ents != ():
                                            name_set.add(token)
            # Begin Step 8: whitelist check
            # sent_tag is the nltk POS tagging for each word at the sentence level.
            for i in range(len(sent_tag[0])):
                # word contains the i-th word and its POS tag
                word = sent_tag[0][i]
                # print(word)
                # word_output is just the raw word itself
                word_output = word[0]
                if word_output not in string.punctuation:
                    # remove the special chars
                    word_check = str(pattern_word.sub('', word_output))
                    #if word_check.title() in ['Dr', 'Mr', 'Mrs', 'Ms']:
                        #print(word_check)
                    try:
                        # word[1] is the POS tag of the word
                        if (((word[1] == 'NN' or word[1] == 'NNP') or
                             ((word[1] == 'NNS' or word[1] == 'NNPS') and word_check.istitle()))):
                            if word_check.lower() not in whitelist_dict:
                                screened_words.append(word_output)
                                word_output = "**PHI**"
                                safe = False
                            else:
                                # for words that are in the whitelist, check to make sure
                                # that we have not identified them as names
                                if ((word_output.istitle() or word_output.isupper()) and
                                        pattern_name.findall(word_output) != [] and
                                        re.search(r'\b([A-Z])\b', word_check) is None):
                                    word_output, name_set, screened_words, safe = namecheck(
                                        word_output, name_set, screened_words, safe)
                        # check day/year according to the month name
                        elif word[1] == 'CD':
                            if i > 2:
                                context_before = sent_tag[0][i - 3:i]
                            else:
                                context_before = sent_tag[0][0:i]
                            if i <= len(sent_tag[0]) - 4:
                                context_after = sent_tag[0][i + 1:i + 4]
                            else:
                                context_after = sent_tag[0][i + 1:]
                            #print(word_output, context_before+context_after)
                            for j in (context_before + context_after):
                                if pattern_mname.search(j[0]) is not None:
                                    screened_words.append(word_output)
                                    #print(word_output)
                                    word_output = "**PHI**"
                                    safe = False
                                    break
                        else:
                            word_output, name_set, screened_words, safe = namecheck(
                                word_output, name_set, screened_words, safe)
                    except:
                        print(word_output, sys.exc_info())
                    if word_output.lower()[0] == '\'s':
                        if phi_reduced[-7:] != '**PHI**':
                            phi_reduced = phi_reduced + word_output
                            #print(word_output)
                    else:
                        phi_reduced = phi_reduced + ' ' + word_output
                # format output for later use by eval.py
                else:
                    if (i > 0 and sent_tag[0][i - 1][0][-1] in string.punctuation and
                            sent_tag[0][i - 1][0][-1] != '*'):
                        phi_reduced = phi_reduced + word_output
                    elif word_output == '.' and sent_tag[0][i - 1][0] in ['Dr', 'Mr', 'Mrs', 'Ms']:
                        phi_reduced = phi_reduced + word_output
                    else:
                        phi_reduced = phi_reduced + ' ' + word_output
            #print(phi_reduced)
        # Begin Step 8: check middle initial and month name
        if pattern_mname.findall(phi_reduced) != []:
            for item in pattern_mname.findall(phi_reduced):
                screened_words.append(item[0])
            phi_reduced = pattern_mname.sub('**PHI**', phi_reduced)
        if pattern_middle.findall(phi_reduced) != []:
            for item in pattern_middle.findall(phi_reduced):
                # print(item[0])
                screened_words.append(item[0])
            phi_reduced = pattern_middle.sub('**PHI** **PHI** ', phi_reduced)
        # print(phi_reduced)
        if not safe:
            phi_containing_records = 1
        # save phi_reduced file
        filename = '.'.join(tail.split('.')[:-1]) + "_" + key_name + ".txt"
        filepath = os.path.join(foutpath, filename)
        with open(filepath, "w") as phi_reduced_note:
            phi_reduced_note.write(phi_reduced)
        # save filtered words
        #screened_words = list(filter(lambda a: a != '**PHI**', screened_words))
        filepath = os.path.join(foutpath, 'filter_summary.txt')
        #print(filepath)
        screened_words = list(filter(lambda a: '**PHI' not in a, screened_words))
        #screened_words = list(filter(lambda a: a != '**PHI**', screened_words))
        #print(screened_words)
        with open(filepath, 'a') as fout:
            fout.write('.'.join(tail.split('.')[:-1]) + ' ' +
                       str(len(screened_words)) + ' ' +
                       ' '.join(screened_words) + '\n')
            # fout.write(' '.join(screened_words))
        print(total_records, f,
              "--- %s seconds ---" % (time.time() - start_time_single))
        # hunpos needs to close the session
        #pretrain.close()
    return total_records, phi_containing_records
class CRF_Extractor:
    '''
    extract features for the CRF model
    each line is a feature vector for a token
    '''
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color']

        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)

        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

        self.sentences = []

        self.porter = PorterStemmer()

        self.token_dict = None
        self.bins = 50

    def add_sentence(self, sentence):
        self.sentences.append(sentence)

    def get_token_tf(self):
        self.token_dict = defaultdict(float)

        for tokens, _, _ in self.sentences:
            for token in self.porter.stem_tokens(tokens):
                self.token_dict[token] += 1.0

        self.rank_dict = defaultdict(int)
        rank_tokens = sorted(self.token_dict, key=self.token_dict.get, reverse=True)

        self.rank_dict = defaultdict(int)
        for i, token in enumerate(rank_tokens):
            self.rank_dict[token] = int(i*10/len(rank_tokens))

        for t, v in self.token_dict.items():  # normalized by the number of sentences
            x = v/len(self.sentences)
            if x > 1.0:
                x = 1.0
            self.token_dict[t] = x

    def get_feature_names(self):
        return '_'.join(self.features)

    def get_i_j(self, body, i, j):
        '''
        return the value of the crf template feature u[i, j]
        input:
            body: [][], two-dimensional array, representing the crf features for a sentence
            i: int, the index of i
            j: int, the index of j
        '''
        n = len(body)
        if i < 0:
            v = '_x%d'%(i)
        elif i >= n:
            v = '_x+%d'%(i-n+1)
        else:
            v = body[i][j]
        return v

    def extract_U_i_j(self, data_body, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        input:
            data_body: [][], two-dimensional array, representing the crf data for a sentence
            feature_body: [][], two-dimensional array, the resulting feature data for a sentence
            i: int, the index of i
            j: int, the index of j
            tag: the prefix of the feature name
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j)))

    def extract_U_i_j_m_n(self, data_body, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n)))

    def extract_U_i_j_m_n_x_y(self, data_body, feature_body, i, j, m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x, y] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y)))

    def extract_word_U_i_j(self, data_body, index, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j)))

    def extract_word_U_i_j_m_n(self, data_body, index, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n)))

    def extract_word_U_i_j_m_n_x_y(self, data_body, index, feature_body, i, j, m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x, y] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y)))

    def extract_bigram(self, body):
        '''
        extract the bigram feature for the crf template
        '''
        for row in body:
            row.append('b')

    def extract_crf_features(self, tokens, tags, prompt, colors=None):
        '''
        Extract the character features, each token a line
        return: [][], two-dimensional array, representing the feature data of the sentence
        '''
        body = []

        words = tokens
        N = len(tokens)

        # first row: the word token
        for word in words:
            row = []
            row.append(word)
            body.append(row)

        if 'pos' in self.features:
            pos_tags = self.pos_tagger.tag(tokens)
            for i, (_, p_tag) in enumerate(pos_tags):
                body[i].append(p_tag)

        if 'chunk' in self.features:
            chunk_tags = self.chunk_tagger.tag(tokens)
            for i, (_, c_tag) in enumerate(chunk_tags):
                body[i].append(c_tag)

        if 'promptword' in self.features:
            for i, token in enumerate(tokens):
                if token in prompt_words[prompt]:
                    body[i].append('Y')
                else:
                    body[i].append('N')

        if 'stopword' in self.features:
            for i, token in enumerate(tokens):
                if token in stopwords:
                    body[i].append('Y')
                else:
                    body[i].append('N')

        if 'tf' in self.features:
            if self.token_dict == None:
                self.get_token_tf()
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert(token in self.token_dict)
                x = int(self.token_dict[token]*self.bins)
                body[i].append(str(x))

        if 'rank' in self.features:
            if self.rank_dict == None:
                self.get_token_tf()
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert(token in self.rank_dict)
                x = self.rank_dict[token]
                body[i].append(str(x))

        if 'color' in self.features and colors != None:
            for color in colors:
                for i, tag in enumerate(tags):
                    body[i].append(str(color[i]))

        # last row:
        tags = [tag for tag in tags]
        for i, tag in enumerate(tags):
            body[i].append(tag)

        return body
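# Not from the original project: a hypothetical usage sketch for CRF_Extractor.
# It assumes the module-level globals the class relies on (global_params.sennadir,
# prompt_words, stopwords, and a PorterStemmer that provides stem_tokens) are
# already configured, and that 'MP' is a valid prompt_words key -- both are
# placeholders here, not values taken from the original code.
extractor = CRF_Extractor()

tokens = ['Gradient', 'descent', 'was', 'confusing']
tags = ['O', 'O', 'B-MP', 'I-MP']

# sentences are stored as (tokens, ..., ...) tuples; get_token_tf() reads only the tokens
extractor.add_sentence((tokens, None, None))

rows = extractor.extract_crf_features(tokens, tags, prompt='MP')
for row in rows:
    print('\t'.join(row))  # one CRF feature line per token: word, POS, chunk, ..., tag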
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.tag import SennaTagger
from nltk import ne_chunk, pos_tag, defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

'''--------------------------------------Exercise 1------------------------------------------------------------------'''
''' Exercise sheet Lab1_TextProcessing.pdf '''
#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')

vectorizer = CountVectorizer()

'''----------------SENNA Tagger-------------------'''
sena_tagger = SennaTagger(
    '/home/starksultana/Documentos/MEIC/5o_ano/1o semestre/PRI/Labs/lab1/senna-v3.0/senna'
)

print("----------------EXERCISE 1-------------------")


#Exercise 1.1
def partition(A, low, high):
    pivot = A[low]
    leftwall = low
    for i in range(low, high + 1):  # not sure this is right; the pseudocode said low + 1
        if (A[i] < pivot):
            leftwall += 1
            A[leftwall], A[i] = A[i], A[leftwall]
    A[leftwall], A[low] = A[low], A[leftwall]
# -*- coding: utf-8 -*-
"""
Created on Sun May 14 12:37:50 2017

@author: Shanika Ediriweera
"""
from nltk import word_tokenize
from nltk.tag import SennaTagger

senna = SennaTagger('../../tools/senna')

sents = ["All the banks are closed", "Today is Sunday"]
tokenized_sents = [word_tokenize(sent) for sent in sents]
print(senna.tag_sents(tokenized_sents))
#Imports
import os, nltk
import re
from nltk.tag import SennaTagger, SennaChunkTagger
from nltk.tokenize import sent_tokenize

#Constants
SOURCE_DIR = '../data/annotated/'
SENNA_INPUT_DIR_RESPS = '../data/senna_input_resps/'
SENNA_INPUT_DIR_SENTS = '../data/senna_input_sents/'
SENNA_DEST_DIR = '../data/senna_wordlist/'
SENNA_EXECUTABLE_DIR = '../../tools/senna'

"""
for now these taggers are not used. SENNA tagging is done manually using a shell script
"""
pos_tagger = SennaTagger(SENNA_EXECUTABLE_DIR)
chunk_tagger = SennaChunkTagger(SENNA_EXECUTABLE_DIR)


def add_space_between_sentences(text):
    """
    Add space between sentences where no space is added after period
    """
    space_added_txt = re.sub(r"(\w+)\.(\w+)", r"\1. \2", text)
    return space_added_txt


def add_space_between_sentence_and_period(text, text_type):
    """
    Add space between sentence and period.
    This is needed for SENNA to tokenize sentences.
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

fr = open(sys.argv[1]).read()
sent = sent_tokenize(fr)
aword = word_tokenize(sent[0])
tagger = SennaTagger('/usr/share/senna-v2.0')
for w, t in tagger.tag(aword):
    print(w, t)
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import sys
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

argvs = sys.argv
argc = len(argvs)

# show a usage message if the arguments are invalid
if (argc != 2):
    print('Usage: # python %s filename' % argvs[0])
    quit()

# set up the tagger
tagger = SennaTagger('/usr/share/senna-v2.0')

# sentence segmentation
openedFile = open(argvs[1]).read()
sent_tokenize_list = sent_tokenize(openedFile)

# word-tokenize the first sentence
word_tokenize_list = word_tokenize(sent_tokenize_list[0])

# tagging
for w, t in tagger.tag(word_tokenize_list):
    print(w, t)
'put': ["perform", "mark", "evaluate", "update", "set", "change", "edit"], 'delete': ["delete", "destroy", "kill", "remove", "cancel"] } # response code lists L404 = [ 'not found', 'doesn\'t exist', 'does not exist', 'unable to find', 'can\'t find' ] L401 = ['unauthorized', 'not allowed', 'rejected', 'denied'] L400 = ['failed', 'unsuccessful'] # st = StanfordPOSTagger("C:/Users/Tasos/OneDriveThesis/Thesis/src/lib/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger", # "C:/Users/Tasos/OneDriveThesis/Thesis/src/lib/stanford-postagger-full-2015-12-09/stanford-postagger.jar") senna_tagger = SennaTagger( "C:/Users/Tasos/OneDriveThesis/Thesis/src/lib/senna") p = inflect.engine() def resource_analysis(resources, resource_names): model = {} hateoas_graph = {} for resource, scenarios in resources.items(): hateoas_graph[resource] = [] model[resource] = { 'get': { 'request_params': [], 'response': { 'params': [], 'links': [] }
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

f = open(sys.argv[1], 'r')
lines = sent_tokenize(f.read())
tagger = SennaTagger('/usr/share/senna-v2.0')
words = word_tokenize(lines[1])
print(tagger.tag(words))