Example #1
 def test_senna_tagger(self):
     tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
     result = tagger.tag('What is the airspeed of an unladen swallow ?'.split())
     expected = [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),
                 ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'),
                 ('?', '.')]
     self.assertEqual(result, expected)
Example #2
 def test_senna_tagger(self):
     tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
     result = tagger.tag(
         'What is the airspeed of an unladen swallow ?'.split())
     expected = [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
                 ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
                 ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')]
     self.assertEqual(result, expected)
Example #3
 def test_senna_tagger(self):
     tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
     result = tagger.tag("What is the airspeed of an unladen swallow ?".split())
     expected = [
         ("What", "WP"),
         ("is", "VBZ"),
         ("the", "DT"),
         ("airspeed", "NN"),
         ("of", "IN"),
         ("an", "DT"),
         ("unladen", "NN"),
         ("swallow", "NN"),
         ("?", "."),
     ]
     self.assertEqual(result, expected)
 def __init__(self, config):
     '''
     learning_algorithm -- the algorithm to train with
         (default "SVM")
     '''
 
     self.training_file = config.get('model', 'train')
     self.learning_algorithm = config.get('model', 'classify')
     self.features = config.get('model', 'features').split(',')
     #print self.features
     
     self.course =  config.get('model', 'course')
     self.test_file = '../data/' + self.course + '.json'
     
     self._model = None
     
     if 'pos' in self.features:
         self.tagger = SennaTagger(config.get('model', 'senna'))
     
     if 'content' in self.features:
         self.contentwords = [line.strip().lower() for line in open(config.get('model', 'content')).readlines()]
     
     if 'organization' in self.features:
         self.orgnizationwords = [line.strip().lower() for line in open(config.get('model', "organization")).readlines()]
     
     featuresets = self._get_training_data()
     self._train_classifier_model(featuresets)
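
This constructor reads everything from a single [model] section of a config object. Below is a minimal sketch of the configuration it appears to expect, assuming an INI-style file loaded with the standard-library configparser; every key, path, and value is illustrative, not taken from the original project.

# Hypothetical configuration for the __init__ above.
# The [model] keys mirror the config.get('model', ...) calls; paths are placeholders.
import configparser

CONFIG_TEXT = """
[model]
train = ../data/train.json
classify = SVM
features = WC,unigram,pos,content,organization
course = Engineer
senna = /usr/share/senna-v3.0
content = ../data/contentwords.txt
organization = ../data/organizationwords.txt
"""

config = configparser.ConfigParser()
config.read_string(CONFIG_TEXT)
print(config.get('model', 'features').split(','))  # ['WC', 'unigram', 'pos', ...]
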
Example #5
class Senna():
    '''SennaTagger'''
    
    def __init__(self,
                path='senna',
                **kwargs):   
        
        self.__dict__.update(kwargs)
        
        if not os.path.isabs(path):
            current_dir = os.path.dirname(os.path.abspath(__file__))
            path = os.path.join(current_dir, path)
        
        paths = (
                path,
                os.path.join(sys.exec_prefix, r'lib\site-packages', 'senna'),
                os.path.join(MODULEDIR, 'bin', 'senna')
        )
        
        for path in paths:
            if os.path.exists(path):
               break
        else:
            raise FileNotFoundError(paths) 
         
        self.tagger = SennaTagger(path, **kwargs)
  
    def __call__(self, tokens):
        return  self.tagger.tag(tokens)  
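
A hedged usage sketch for the wrapper above: it assumes the Senna class is importable and that a SENNA installation exists at one of the candidate paths (the path below is a guess); otherwise the constructor raises FileNotFoundError.

# Illustrative only -- requires a local SENNA install.
senna = Senna(path='/usr/share/senna-v3.0')
tags = senna('What is the airspeed of an unladen swallow ?'.split())
print(tags)  # e.g. [('What', 'WP'), ('is', 'VBZ'), ..., ('?', '.')]
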
Example #6
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = [
            'pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color'
        ]

        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)

        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

        self.sentences = []

        self.porter = PorterStemmer()

        self.token_dict = None
        self.bins = 50
 def __init__(self):
     '''
     if phrase_dict_json != None: extract the phrase features
     if subtype_flag = True, extract the features by sub parse_type
     if bioe_flag = True, use the BIOE tags
     '''
     self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color']
     
     if 'pos' in self.features:
         self.pos_tagger = SennaTagger(global_params.sennadir)
     
     if 'chunk' in self.features:
         self.chunk_tagger = SennaChunkTagger(global_params.sennadir)
     
     self.sentences = []
     
     self.porter = PorterStemmer()
     
     self.token_dict = None
     self.bins = 50
Example #8
class Tagger:
    """Tag words' part of speech in sentences."""
    def __init__(self):
        self.tagger = SennaTagger('/app/util/senna')

    def tag(self, tokens_in_sentence):
        """Tag tokens in sentences with their part of speech.

        :param tokens_in_sentence: list of tokens, grouped by sentence.
        :return: iterator of lists with words.
        """
        return ((Word(t, self._parse_POS(POS)) for t, POS in s)
                for s in self.tagger.tag_sents(tokens_in_sentence))

    def _parse_POS(self, tag):
        if tag.startswith('J'):
            return PartOfSpeach.ADJECTIVE
        elif tag.startswith('V'):
            return PartOfSpeach.VERB
        elif tag.startswith('N'):
            return PartOfSpeach.NOUN
        elif tag.startswith('R'):
            return PartOfSpeach.ADVERB
        return PartOfSpeach.OTHER
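
Word and PartOfSpeach come from elsewhere in that project and are not shown here. The following is a minimal sketch of stand-ins that would make the snippet self-contained; the names mirror the snippet (including the PartOfSpeach spelling), but the definitions themselves are assumptions.

# Hypothetical stand-ins for the project's own types (fields and values assumed).
from collections import namedtuple
from enum import Enum

Word = namedtuple('Word', ['token', 'pos'])

class PartOfSpeach(Enum):
    ADJECTIVE = 'adjective'
    VERB = 'verb'
    NOUN = 'noun'
    ADVERB = 'adverb'
    OTHER = 'other'

# Usage sketch (needs SENNA at /app/util/senna): tag() yields one generator of
# Word objects per input sentence.
# tagger = Tagger()
# for sentence in tagger.tag([['The', 'cat', 'sat'], ['Dogs', 'bark', 'loudly']]):
#     print([(w.token, w.pos) for w in sentence])
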
                    EventStructures['Who'] = text
                elif 'A2' == arg:
                    EventStructures['Whom'] = text
                text = labels[i][1][0]
                Args.append(text)
        else:
            text += ' ' + labels[i][1][0]

    print(EventStructures)
    return Args


srltagger = SennaSRLTagger(path)
nertagger = SennaNERTagger(path)
chktagger = SennaChunkTagger(path)
tagger = SennaTagger(path)

#w = s.tag("Are you studying here?".split())
#w = s.tag("""A general interface to the SENNA pipeline that supports any of the operations specified in SUPPORTED OPERATIONS..""".split())

#print(tagger.tag(sents))
#print('\n___________________\n')
#print(chktagger.tag(sents))
#print('\n___________________\n')
#print(nertagger.tag(sents))
#print('\n___________________\n')
#print(srltagger.tag(sents))
#print('\n___________________\n')
#text = sent
NE_Tagger(text)
#print('\n'.join(str(e) for e in NE_Tagger(sents)))
Example #10
#define var
cnt = 0  #counter
flist = []  #file list
linked_file = ""  #linked file
ifile = ""  #input file data
splited_file = []  #split file
taged_file = []  #tagged file
f = ""  #filename

import os.path
import nltk

#import Senna Tagger
from nltk.tag import SennaTagger

tagger = SennaTagger('/usr/share/senna-v3.0')

#loop
while True:
    #import data
    cnt += 1
    ifile = input("please input file " + str(cnt) +
                  " (e to end input / q to quit) : ")

    #escape from loop
    if ifile == "e":
        break

    elif ifile == "q":
        quit()
class QualityPrediction:
    def __init__(self, config):
        '''
        learning_algorithm -- the algorithm to train with
            (default "SVM")
        '''
    
        self.training_file = config.get('model', 'train')
        self.learning_algorithm = config.get('model', 'classify')
        self.features = config.get('model', 'features').split(',')
        #print self.features
        
        self.course =  config.get('model', 'course')
        self.test_file = '../data/' + self.course + '.json'
        
        self._model = None
        
        if 'pos' in self.features:
            self.tagger = SennaTagger(config.get('model', 'senna'))
        
        if 'content' in self.features:
            self.contentwords = [line.strip().lower() for line in open(config.get('model', 'content')).readlines()]
        
        if 'organization' in self.features:
            self.orgnizationwords = [line.strip().lower() for line in open(config.get('model', "organization")).readlines()]
        
        featuresets = self._get_training_data()
        self._train_classifier_model(featuresets)
        
    
    def evaluate(self):
        test_featureset = self._get_featuresets(self.test_file)
        
        labels = [int(x[1]) for x in test_featureset]
        featureset = [x[0] for x in test_featureset]
        predicts = [int(x) for x in self._model.classify_many(featureset)]
        
        metric = Metric()
        
        return metric.accuracy(labels, predicts), metric.kappa(labels, predicts), metric.QWkappa(labels, predicts)
    
    def get_features(self, text, cid, lecture):
        features = {}
        
        #unigram
        tokens = nltk.word_tokenize(text)
        
        if 'WC' in self.features:
            features['WC'] = len(tokens)
       
        if 'unigram' in self.features:
            for token in tokens:
                features['U0_'+token.lower()] = 1
        
        if 'pos' in self.features:
            tags = self.tagger.tag(tokens)
            for _, tag in tags:
                features['P0_'+tag] = 1
        
        if 'content' in self.features:
            hasContentWord = 0
            for word in tokens:
                if word.lower() in self.contentwords:
                    hasContentWord = 1
                    break
            features['C0_'] = hasContentWord
        
        if 'organization' in self.features:
            OrgAssign = 0
            for word in tokens:
                if word.lower() in self.orgnizationwords:
                    OrgAssign = 1
                    break
            features['O0_'] = OrgAssign
                    
        return features
        
    def get_model(self):
        """An accessor method for the model."""
        return self._model
    
    def _get_featuresets(self, input):
        featuresets = []
        
        MPLectures = file_util.LoadDictJson(input)
        
        for week, MPs in enumerate(MPLectures):
            if MPs == []: continue
            
            for k, (MP, score) in enumerate(MPs):
                features = self.get_features(MP, week, 'Engineer')
                featuresets.append((features,score))
        
        return featuresets
        
    def _get_training_data(self):
        """Builds and returns positive and negative feature sets
        for the algorithm

        """
        featuresets = self._get_featuresets(self.training_file)
        return featuresets
    
    def _train_classifier_model(self, featuresets):
        """This changes the algorithm that nltk uses to train the model.

        Arguments:
        featuresets -- array of features generated for training

        """
        model = None
        if(self.learning_algorithm == "NB"):
            model = nltk.NaiveBayesClassifier.train(featuresets)
        elif(self.learning_algorithm == "MaxEnt"):
            model = nltk.MaxentClassifier.train(featuresets, "MEGAM",
                                                 max_iter=15)
        elif(self.learning_algorithm == "DecisionTree"):
            model = nltk.DecisionTreeClassifier.train(featuresets, 0.05)
        elif(self.learning_algorithm == 'SVM'):
            model = SklearnClassifier(SVC(kernel='linear')).train(featuresets)
        self._model = model
        
    def predict(self, text, cid=None, lecture=None):
        features = self.get_features(text, cid, lecture)
        return self._model.classify(features)
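
A hedged sketch of driving the class above end to end; the config file name is an assumption (with a [model] section like the illustrative one sketched earlier), and training only succeeds if the referenced data files and the SENNA path actually exist.

import configparser

config = configparser.ConfigParser()
config.read('config.ini')  # assumed file with a [model] section as sketched earlier

qp = QualityPrediction(config)   # the constructor trains the classifier
print(qp.predict('This lecture explained convolution clearly.'))
print(qp.evaluate())             # (accuracy, kappa, quadratic-weighted kappa)
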
import sys
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import SennaTagger

argv = sys.argv

sent_tokenized = sent_tokenize(open(argv[1]).read())
word_tokenized = word_tokenize(sent_tokenized[0])

tagger = SennaTagger('/usr/share/senna-v2.0')

for a,b in tagger.tag(word_tokenized):
    print(b,"\11",a)
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger
tagger = SennaTagger('/usr/share/senna-v2.0')
argv = sys.argv
text = open(argv[1] ,'r').read()
sentence = sent_tokenize(text)
count = 0
for part in sentence:
    count += 1
    if count < 2:
        word = tagger.tag(part.split())
        for i in word:
            print(i)
Example #14
def filter_task(f, whitelist_dict, foutpath, key_name):
    """
    Uses namecheck() to check whether a word tagged as a name by either nltk or spacy really is one.
    namecheck() first searches name_set, which is built by checking words at the sentence level and
    tagging names. If a word is not in name_set, namecheck() uses spacy's nlp() to check whether the
    word is likely to be a name at the word level.
    """
    # pretrain = HunposTagger('hunpos.model', 'hunpos-1.0-linux/hunpos-tag')
    pretrain = SennaTagger('senna')
    with open(f, encoding='utf-8', errors='ignore') as fin:
        # define initial variables
        head, tail = os.path.split(f)
        #f_name = re.findall(r'[\w\d]+', tail)[0]  # get the file number
        print(tail)
        start_time_single = time.time()
        total_records = 1
        phi_containing_records = 0
        safe = True
        screened_words = []
        name_set = set()
        phi_reduced = ''
        '''
        address_indictor = ['street', 'avenue', 'road', 'boulevard',
                            'drive', 'trail', 'way', 'lane', 'ave',
                            'blvd', 'st', 'rd', 'trl', 'wy', 'ln',
                            'court', 'ct', 'place', 'plc', 'terrace', 'ter']
                            '''
        address_indictor = [
            'street', 'avenue', 'road', 'boulevard', 'drive', 'trail', 'way',
            'lane', 'ave', 'blvd', 'st', 'rd', 'trl', 'wy', 'ln', 'court',
            'ct', 'place', 'plc', 'terrace', 'ter', 'highway', 'freeway',
            'autoroute', 'autobahn', 'expressway', 'autostrasse', 'autostrada',
            'byway', 'auto-estrada', 'motorway', 'avenue', 'boulevard', 'road',
            'street', 'alley', 'bay', 'drive', 'gardens', 'gate', 'grove',
            'heights', 'highlands', 'lane', 'mews', 'pathway', 'terrace',
            'trail', 'vale', 'view', 'walk', 'way', 'close', 'court', 'place',
            'cove', 'circle', 'crescent', 'square', 'loop', 'hill', 'causeway',
            'canyon', 'parkway', 'esplanade', 'approach', 'parade', 'park',
            'plaza', 'promenade', 'quay', 'bypass'
        ]

        note = fin.read()
        note = re.sub(r'=', ' = ', note)
        # Begin Step 1: salutation check
        re_list = pattern_salutation.findall(note)
        for i in re_list:
            name_set = name_set | set(i[1].split(' '))

        # note_length = len(word_tokenize(note))
        # Begin step 2: split document into sentences
        note = sent_tokenize(note)

        for sent in note:  # Begin Step 3: Pattern checking
            # postal code check
            # print(sent)
            if pattern_postal.findall(sent) != []:
                safe = False
                for item in pattern_postal.findall(sent):
                    screened_words.append(item[0])
            sent = str(pattern_postal.sub('**PHIPostal**', sent))

            if pattern_devid.findall(sent) != []:
                safe = False
                for item in pattern_devid.findall(sent):
                    if (re.search(r'\d', item) is not None
                            and re.search(r'[A-Z]', item) is not None):
                        screened_words.append(item)
                        sent = sent.replace(item, '**PHI**')

            # number check
            if pattern_number.findall(sent) != []:
                safe = False
                for item in pattern_number.findall(sent):
                    # print(item)
                    #if pattern_date.match(item[0]) is None:
                    sent = sent.replace(item[0], '**PHI**')
                    screened_words.append(item[0])
                    #print(item[0])
            #sent = str(pattern_number.sub('**PHI**', sent))
            '''
            if pattern_date.findall(sent) != []:
                safe = False
                for item in pattern_date.findall(sent):
                    if '-' in item[0]:
                        if (len(set(re.findall(r'[^\w\-]',item[0]))) <= 1):
                            screened_words.append(item[0])
                            #print(item[0])
                            sent = sent.replace(item[0], '**PHIDate**')
                    else:
                        if len(set(re.findall(r'[^\w]',item[0]))) == 1:
                            screened_words.append(item[0])
                            #print(item[0])
                            sent = sent.replace(item[0], '**PHIDate**')
            '''
            data_list = []
            if pattern_date.findall(sent) != []:
                safe = False
                for item in pattern_date.findall(sent):
                    if '-' in item[0]:
                        if (len(set(re.findall(r'[^\w\-]', item[0]))) <= 1):
                            #screened_words.append(item[0])
                            #print(item[0])
                            data_list.append(item[0])
                            #sent = sent.replace(item[0], '**PHIDate**')
                    else:
                        if len(set(re.findall(r'[^\w]', item[0]))) == 1:
                            #screened_words.append(item[0])
                            #print(item[0])
                            data_list.append(item[0])
                            #sent = sent.replace(item[0], '**PHIDate**')
            data_list.sort(key=len, reverse=True)
            for item in data_list:
                sent = sent.replace(item, '**PHIDate**')

            #sent = str(pattern_date.sub('**PHI**', sent))
            #print(sent)
            if pattern_4digits.findall(sent) != []:
                safe = False
                for item in pattern_4digits.findall(sent):
                    screened_words.append(item)
            sent = str(pattern_4digits.sub('**PHI**', sent))
            # email check
            if pattern_email.findall(sent) != []:
                safe = False
                for item in pattern_email.findall(sent):
                    screened_words.append(item)
            sent = str(pattern_email.sub('**PHI**', sent))
            # url check
            if pattern_url.findall(sent) != []:
                safe = False
                for item in pattern_url.findall(sent):
                    #print(item[0])
                    if (re.search(r'[a-z]', item[0]) is not None
                            and '.' in item[0]
                            and re.search(r'[A-Z]', item[0]) is None
                            and len(item[0]) > 10):
                        print(item[0])
                        screened_words.append(item[0])
                        sent = sent.replace(item[0], '**PHI**')
                        #print(item[0])
            #sent = str(pattern_url.sub('**PHI**', sent))
            # dob check
            '''
            re_list = pattern_dob.findall(sent)
            i = 0
            while True:
                if i >= len(re_list):
                    break
                else:
                    text = ' '.join(re_list[i][0].split(' ')[-6:])
                    if re.findall(r'\b(birth|dob)\b', text, re.I) != []:
                        safe = False
                        sent = sent.replace(re_list[i][1], '**PHI**')
                        screened_words.append(re_list[i][1])
                    i += 2
            '''

            # Begin Step 4
            # substitute spaces for special characters
            sent = re.sub(r'[\/\-\:\~\_]', ' ', sent)
            # label all words for NER using the sentence level context.
            spcy_sent_output = nlp(sent)
            # split sentences into words
            sent = [word_tokenize(sent)]
            #print(sent)
            # Begin Step 5: context level pattern matching with regex
            for position in range(0, len(sent[0])):
                word = sent[0][position]
                # age check
                if word.isdigit() and int(word) > 90:
                    if position <= 2:  # check the words before age
                        word_previous = ' '.join(sent[0][:position])
                    else:
                        word_previous = ' '.join(sent[0][position -
                                                         2:position])
                    if position >= len(
                            sent[0]) - 2:  # check the words after age
                        word_after = ' '.join(sent[0][position + 1:])
                    else:
                        word_after = ' '.join(sent[0][position + 1:position +
                                                      3])

                    age_string = str(word_previous) + str(word_after)
                    if pattern_age.findall(age_string) != []:
                        screened_words.append(sent[0][position])
                        sent[0][position] = '**PHI**'
                        safe = False

                # address check
                elif (position >= 1 and position < len(sent[0]) - 1 and
                      (word.lower() in address_indictor or
                       (word.lower() == 'dr' and sent[0][position + 1] != '.'))
                      and (word.istitle() or word.isupper())):

                    if sent[0][position - 1].istitle() or sent[0][position -
                                                                  1].isupper():
                        screened_words.append(sent[0][position - 1])
                        sent[0][position - 1] = '**PHI**'
                        i = position - 1
                        # find the closest number, which should be the street number
                        while True:
                            if re.findall(r'^[\d-]+$', sent[0][i]) != []:
                                begin_position = i
                                break
                            elif i == 0 or position - i > 5:
                                begin_position = position
                                break
                            else:
                                i -= 1
                        i = position + 1
                        # block the info of city, state, apt number, etc.
                        while True:
                            if '**PHIPostal**' in sent[0][i]:
                                end_position = i
                                break
                            elif i == len(sent[0]) - 1:
                                end_position = position
                                break
                            else:
                                i += 1
                        if end_position <= position:
                            end_position = position

                        for i in range(begin_position, end_position):
                            #if sent[0][i] != '**PHIPostal**':
                            screened_words.append(sent[0][i])
                            sent[0][i] = '**PHI**'
                            safe = False

            # Begin Step 6: NLTK POS tagging
            sent_tag = nltk.pos_tag_sents(sent)
            #try:
            # senna cannot handle long sentence.
            #sent_tag = [[]]
            #length_100 = len(sent[0])//100
            #for j in range(0, length_100+1):
            #[sent_tag[0].append(j) for j in pretrain.tag(sent[0][100*j:100*(j+1)])]
            # hunpos needs to change the type from bytes to string
            #print(sent_tag[0])
            #sent_tag = [pretrain.tag(sent[0])]
            #for j in range(len(sent_tag[0])):
            #sent_tag[0][j] = list(sent_tag[0][j])
            #sent_tag[0][j][1] = sent_tag[0][j][1].decode('utf-8')
            #except:
            #print('POS error:', tail, sent[0])
            #sent_tag = nltk.pos_tag_sents(sent)
            # Begin Step 7: Use both NLTK and Spacy to check if the word is a name based on sentence level NER label for the word.
            for ent in spcy_sent_output.ents:  # spcy_sent_output contains a dict with each word in the sentence and its NLP labels
                #spcy_sent_ouput.ents is a list of dictionaries containing chunks of words (phrases) that spacy believes are Named Entities
                # Each ent has 2 properties: text which is the raw word, and label_ which is the NER category for the word
                if ent.label_ == 'PERSON':
                    #print(ent.text)
                    # if word is person, recheck that spacy still thinks word is person at the word level
                    spcy_chunk_output = nlp(ent.text)
                    if (spcy_chunk_output.ents != ()
                            and spcy_chunk_output.ents[0].label_ == 'PERSON'):
                        # Now check to see what labels NLTK provides for the word
                        name_tag = word_tokenize(ent.text)
                        # senna & hunpos
                        #name_tag = pretrain.tag(name_tag)
                        # hunpos needs to change the type from bytes to string
                        #for j in range(len(name_tag)):
                        #name_tag[j] = list(name_tag[j])
                        #name_tag[j][1] = name_tag[j][1].decode('utf-8')
                        #chunked = ne_chunk(name_tag)
                        # default
                        name_tag = pos_tag_sents([name_tag])
                        chunked = ne_chunk(name_tag[0])
                        for i in chunked:
                            # if ne_chunk thinks the chunk is a named entity, it creates a Tree whose
                            # leaves are the words in the chunk (with their POS labels) and whose
                            # label is the single NER category for the chunk
                            if type(i) == Tree:
                                if i.label() == 'PERSON':
                                    for token, pos in i.leaves():
                                        if pos == 'NNP':
                                            name_set.add(token)

                                else:
                                    for token, pos in i.leaves():
                                        spcy_upper_output = nlp(token.upper())
                                        if spcy_upper_output.ents != ():
                                            name_set.add(token)

            # BEGIN STEP 8: whitelist check
            # sent_tag is the nltk POS tagging for each word at the sentence level.
            for i in range(len(sent_tag[0])):
                # word contains the i-th word and it's POS tag
                word = sent_tag[0][i]
                # print(word)
                # word_output is just the raw word itself
                word_output = word[0]

                if word_output not in string.punctuation:
                    word_check = str(pattern_word.sub('', word_output))
                    #if word_check.title() in ['Dr', 'Mr', 'Mrs', 'Ms']:
                    #print(word_check)
                    # remove the special chars
                    try:
                        # word[1] is the pos tag of the word

                        if (((word[1] == 'NN' or word[1] == 'NNP')
                             or ((word[1] == 'NNS' or word[1] == 'NNPS')
                                 and word_check.istitle()))):
                            if word_check.lower() not in whitelist_dict:
                                screened_words.append(word_output)
                                word_output = "**PHI**"
                                safe = False
                            else:
                                # For words that are in whitelist, check to make sure that we have not identified them as names
                                if ((word_output.istitle()
                                     or word_output.isupper())
                                        and pattern_name.findall(word_output)
                                        != []
                                        and re.search(r'\b([A-Z])\b',
                                                      word_check) is None):
                                    word_output, name_set, screened_words, safe = namecheck(
                                        word_output, name_set, screened_words,
                                        safe)

                        # check day/year according to the month name
                        elif word[1] == 'CD':
                            if i > 2:
                                context_before = sent_tag[0][i - 3:i]
                            else:
                                context_before = sent_tag[0][0:i]
                            if i <= len(sent_tag[0]) - 4:
                                context_after = sent_tag[0][i + 1:i + 4]
                            else:
                                context_after = sent_tag[0][i + 1:]
                            #print(word_output, context_before+context_after)
                            for j in (context_before + context_after):
                                if pattern_mname.search(j[0]) is not None:
                                    screened_words.append(word_output)
                                    #print(word_output)
                                    word_output = "**PHI**"
                                    safe = False
                                    break
                        else:
                            word_output, name_set, screened_words, safe = namecheck(
                                word_output, name_set, screened_words, safe)

                    except:
                        print(word_output, sys.exc_info())
                    if word_output.lower()[0] == '\'s':
                        if phi_reduced[-7:] != '**PHI**':
                            phi_reduced = phi_reduced + word_output
                        #print(word_output)
                    else:
                        phi_reduced = phi_reduced + ' ' + word_output
                # Format output for later use by eval.py
                else:
                    if (i > 0
                            and sent_tag[0][i - 1][0][-1] in string.punctuation
                            and sent_tag[0][i - 1][0][-1] != '*'):
                        phi_reduced = phi_reduced + word_output
                    elif word_output == '.' and sent_tag[0][i - 1][0] in [
                            'Dr', 'Mr', 'Mrs', 'Ms'
                    ]:
                        phi_reduced = phi_reduced + word_output
                    else:
                        phi_reduced = phi_reduced + ' ' + word_output
            #print(phi_reduced)

            # Begin Step 9: check middle initial and month name
            if pattern_mname.findall(phi_reduced) != []:
                for item in pattern_mname.findall(phi_reduced):
                    screened_words.append(item[0])
            phi_reduced = pattern_mname.sub('**PHI**', phi_reduced)

            if pattern_middle.findall(phi_reduced) != []:
                for item in pattern_middle.findall(phi_reduced):
                    #    print(item[0])
                    screened_words.append(item[0])
            phi_reduced = pattern_middle.sub('**PHI** **PHI** ', phi_reduced)
        # print(phi_reduced)

        if not safe:
            phi_containing_records = 1

        # save phi_reduced file
        filename = '.'.join(tail.split('.')[:-1]) + "_" + key_name + ".txt"
        filepath = os.path.join(foutpath, filename)
        with open(filepath, "w") as phi_reduced_note:
            phi_reduced_note.write(phi_reduced)

        # save filtered words
        #screened_words = list(filter(lambda a: a!= '**PHI**', screened_words))
        filepath = os.path.join(foutpath, 'filter_summary.txt')
        #print(filepath)
        screened_words = list(
            filter(lambda a: '**PHI' not in a, screened_words))
        #screened_words = list(filter(lambda a: a != '**PHI**', screened_words))
        #print(screened_words)
        with open(filepath, 'a') as fout:
            fout.write('.'.join(tail.split('.')[:-1]) + ' ' +
                       str(len(screened_words)) + ' ' +
                       ' '.join(screened_words) + '\n')
            # fout.write(' '.join(screened_words))

        print(total_records, f,
              "--- %s seconds ---" % (time.time() - start_time_single))
        # hunpos needs to close session
        #pretrain.close()
        return total_records, phi_containing_records
Example #15
 def __init__(self):
     self.tagger = SennaTagger('/app/util/senna')
class CRF_Extractor:
    '''
    extract features for the CRF model
    each line is a feature vector for a token
    '''
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color']
        
        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)
        
        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)
        
        self.sentences = []
        
        self.porter = PorterStemmer()
        
        self.token_dict = None
        self.bins = 50
    
    def add_sentence(self, sentence):
        self.sentences.append(sentence)
    
    def get_token_tf(self):
        self.token_dict = defaultdict(float)
        
        for tokens, _, _ in self.sentences:
            for token in self.porter.stem_tokens(tokens):
                self.token_dict[token] += 1.0
        
        self.rank_dict = defaultdict(int)
        rank_tokens = sorted(self.token_dict, key=self.token_dict.get, reverse=True)
        
        self.rank_dict = defaultdict(int)
        for i, token in enumerate(rank_tokens):
            self.rank_dict[token] = int(i*10/len(rank_tokens))
        
        for t, v in self.token_dict.items(): #normalized by the number of sentences
            x = v/len(self.sentences)
            if x > 1.0: x = 1.0
            
            self.token_dict[t] = x
        
    def get_feature_names(self):
        return '_'.join(self.features)
    
    def get_i_j(self, body, i, j):
        '''
        return the value of the crf template feature u[i, j]
        input:
            body: [][], two-dimensional array, representing the crf features for a sentence
            i: int, the index of i
            j: int, the index of j
        '''
        n = len(body)
        if i < 0:
            v = '_x%d'%(i)
        elif i >= n:
            v = '_x+%d'%(i-n+1)
        else:
            v = body[i][j]
        return v
    
    def extract_U_i_j(self, data_body, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        input:
            data_body: [][], two-dimensional array, representing the crf data for a sentence
            feature_body: [][], two-dimensional array, the resulting feature data for a sentence
            i: int, the index of i
            j: int, the index of j
            tag: the prefix of the feature name
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j)))
    
    def extract_U_i_j_m_n(self, data_body, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n)))
    
    def extract_U_i_j_m_n_x_y(self, data_body, feature_body, i, j, m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x,y] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y)))
    
    def extract_word_U_i_j(self, data_body, index, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j)))
    
    def extract_word_U_i_j_m_n(self, data_body, index, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n)))
    
    def extract_word_U_i_j_m_n_x_y(self, data_body, index, feature_body, i, j, m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x, y] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y)))
            
    def extract_bigram(self, body):
        '''
        extract the bigram feature for the crf template
        '''
        for row in body:
            row.append('b')
    
    def extract_crf_features(self, tokens, tags, prompt, colors=None):
        '''
        Extract the CRF features, one line per token
        return: [][], two-dimensional array, representing the feature data of the sentence
        '''
    
        body = []

        words = tokens
        N = len(tokens)
        
        #first row: the word token
        for word in words:
            row = []
            row.append(word)
            body.append(row)
        
        if 'pos' in self.features:
            pos_tags = self.pos_tagger.tag(tokens)
            
            for i, (_, p_tag) in enumerate(pos_tags):
                body[i].append(p_tag)
        
        if 'chunk' in self.features:
            chunk_tags = self.chunk_tagger.tag(tokens)
            
            for i, (_, c_tag) in enumerate(chunk_tags):
                body[i].append(c_tag)
        
        if 'promptword' in self.features:
            for i, token in enumerate(tokens):
                if token in prompt_words[prompt]:
                    body[i].append('Y')
                else:
                    body[i].append('N')
        
        if 'stopword' in self.features:
            for i, token in enumerate(tokens):
                if token in stopwords:
                    body[i].append('Y')
                else:
                    body[i].append('N')
        
        if 'tf' in self.features:
            if self.token_dict == None:
                self.get_token_tf()
            
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert(token in self.token_dict)
                
                x = int(self.token_dict[token]*self.bins)
                body[i].append(str(x))
        
        if 'rank' in self.features:
            if self.rank_dict == None:
                self.get_token_tf()
            
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert(token in self.rank_dict)
                
                x = self.rank_dict[token]
                body[i].append(str(x))        
        
        if 'color' in self.features and colors != None:
            for color in colors:
                for i, tag in enumerate(tags):
                    body[i].append(str(color[i]))
        
        #last row:
        tags = [tag for tag in tags]
        
        for i, tag in enumerate(tags):
            body[i].append(tag)
        
        return body
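
The U[i, j] helpers above implement standard CRF-template indexing: i is a row offset relative to the current token, j is a column into that token's feature row, and out-of-range offsets become sentinel strings. Here is a small standalone illustration of the same rule; it copies get_i_j rather than instantiating CRF_Extractor, which would require a SENNA installation.

# Mirrors CRF_Extractor.get_i_j / extract_U_i_j on a toy sentence.
def get_i_j(body, i, j):
    n = len(body)
    if i < 0:
        return '_x%d' % i             # before the sentence start, e.g. '_x-1'
    if i >= n:
        return '_x+%d' % (i - n + 1)  # past the sentence end, e.g. '_x+1'
    return body[i][j]

data_body = [['The', 'DT'], ['cat', 'NN'], ['sat', 'VBD']]
feature_body = [[word] for word, _ in data_body]

# A "previous token's POS" feature (i=-1, j=1), appended the way extract_U_i_j does.
for k, row in enumerate(feature_body):
    row.append('U-1,1:%s' % get_i_j(data_body, k - 1, 1))

print(feature_body)
# [['The', 'U-1,1:_x-1'], ['cat', 'U-1,1:DT'], ['sat', 'U-1,1:NN']]
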
Example #17
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.tag import SennaTagger
from nltk import ne_chunk, pos_tag, defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
'''--------------------------------------Exercise 1------------------------------------------------------------------'''
''' Exercise sheet Lab1_TextProcessing.pdf '''

#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')

vectorizer = CountVectorizer()
'''----------------SENNA Tagger-------------------'''
sena_tagger = SennaTagger(
    '/home/starksultana/Documentos/MEIC/5o_ano/1o semestre/PRI/Labs/lab1/senna-v3.0/senna'
)

print("----------------EXERCISE 1-------------------")

#Exercise 1.1


def partition(A, low, high):
    pivot = A[low]
    leftwall = low
    for i in range(low, high + 1):  # not sure this is right; the pseudocode said low+1
        if (A[i] < pivot):
            leftwall += 1
            A[leftwall], A[i] = A[i], A[leftwall]
    A[leftwall], A[low] = A[low], A[leftwall]
# -*- coding: utf-8 -*-
"""
Created on Sun May 14 12:37:50 2017

@author: Shanika Ediriweera
"""

from nltk import word_tokenize
from nltk.tag import SennaTagger
senna = SennaTagger('../../tools/senna')
sents = ["All the banks are closed", "Today is Sunday"]

tokenized_sents = [word_tokenize(sent) for sent in sents]
print(senna.tag_sents(tokenized_sents))
Example #19
#Imports
import os, nltk
import re
from nltk.tag import SennaTagger, SennaChunkTagger
from nltk.tokenize import sent_tokenize

#Constants
SOURCE_DIR = '../data/annotated/'
SENNA_INPUT_DIR_RESPS = '../data/senna_input_resps/'
SENNA_INPUT_DIR_SENTS = '../data/senna_input_sents/'
SENNA_DEST_DIR = '../data/senna_wordlist/'
SENNA_EXECUTABLE_DIR = '../../tools/senna'
"""
For now these taggers are not used. SENNA tagging is done manually using a shell script.
"""
pos_tagger = SennaTagger(SENNA_EXECUTABLE_DIR)
chunk_tagger = SennaChunkTagger(SENNA_EXECUTABLE_DIR)


def add_space_between_sentences(text):
    """
    Add space between sentences where no space is added after period
    """
    space_added_txt = re.sub(r"(\w+)\.(\w+)", r"\1. \2", text)
    return space_added_txt
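
The substitution above inserts the missing space after a period that is glued to the next word; a quick illustrative check:

print(add_space_between_sentences("First sentence.Second sentence."))
# -> "First sentence. Second sentence."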


def add_space_between_sentence_and_period(text, text_type):
    """
    Add space between sentence and period.
    This is needed for SENNA to tokenize sentences.
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

fr = open(sys.argv[1]).read()
sent = sent_tokenize(fr)
aword = word_tokenize(sent[0])

tagger = SennaTagger('/usr/share/senna-v2.0')

for w, t in tagger.tag(aword):
    print(w, t)
Example #21
class CRF_Extractor:
    '''
    extract features for the CRF model
    each line is a feature vector for a token
    '''
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = [
            'pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color'
        ]

        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)

        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

        self.sentences = []

        self.porter = PorterStemmer()

        self.token_dict = None
        self.bins = 50

    def add_sentence(self, sentence):
        self.sentences.append(sentence)

    def get_token_tf(self):
        self.token_dict = defaultdict(float)

        for tokens, _, _ in self.sentences:
            for token in self.porter.stem_tokens(tokens):
                self.token_dict[token] += 1.0

        self.rank_dict = defaultdict(int)
        rank_tokens = sorted(self.token_dict,
                             key=self.token_dict.get,
                             reverse=True)

        self.rank_dict = defaultdict(int)
        for i, token in enumerate(rank_tokens):
            self.rank_dict[token] = int(i * 10 / len(rank_tokens))

        for t, v in self.token_dict.items():  # normalized by the number of sentences
            x = v / len(self.sentences)
            if x > 1.0: x = 1.0

            self.token_dict[t] = x

    def get_feature_names(self):
        return '_'.join(self.features)

    def get_i_j(self, body, i, j):
        '''
        return the value of the crf template feature u[i, j]
        input:
            body: [][], two-dimensional array, representing the crf features for a sentence
            i: int, the index of i
            j: int, the index of j
        '''
        n = len(body)
        if i < 0:
            v = '_x%d' % (i)
        elif i >= n:
            v = '_x+%d' % (i - n + 1)
        else:
            v = body[i][j]
        return v

    def extract_U_i_j(self, data_body, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        input:
            data_body: [][], two-dimensional array, representing the crf data for a sentence
            feature_body: [][], two-dimensional array, the resulting feature data for a sentence
            i: int, the index of i
            j: int, the index of j
            tag: the prefix of the feature name
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s' % (tag, self.get_i_j(data_body, k + i, j)))

    def extract_U_i_j_m_n(self, data_body, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(data_body, k + m, n)))

    def extract_U_i_j_m_n_x_y(self, data_body, feature_body, i, j, m, n, x, y,
                              tag):
        '''
        extract the U[i, j]/U[m, n]/U[x,y] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(
                    data_body, k + m, n), self.get_i_j(data_body, k + x, y)))

    def extract_word_U_i_j(self, data_body, index, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s' % (tag, self.get_i_j(data_body, k + i, j)))

    def extract_word_U_i_j_m_n(self, data_body, index, feature_body, i, j, m,
                               n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(data_body, k + m, n)))

    def extract_word_U_i_j_m_n_x_y(self, data_body, index, feature_body, i, j,
                                   m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x, y] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(
                    data_body, k + m, n), self.get_i_j(data_body, k + x, y)))

    def extract_bigram(self, body):
        '''
        extract the bigram feature for the crf template
        '''
        for row in body:
            row.append('b')

    def extract_crf_features(self, tokens, tags, prompt, colors=None):
        '''
        Extract the CRF features, one line per token
        return: [][], two-dimensional array, representing the feature data of the sentence
        '''

        body = []

        words = tokens
        N = len(tokens)

        #first row: the word token
        for word in words:
            row = []
            row.append(word)
            body.append(row)

        if 'pos' in self.features:
            pos_tags = self.pos_tagger.tag(tokens)

            for i, (_, p_tag) in enumerate(pos_tags):
                body[i].append(p_tag)

        if 'chunk' in self.features:
            chunk_tags = self.chunk_tagger.tag(tokens)

            for i, (_, c_tag) in enumerate(chunk_tags):
                body[i].append(c_tag)

        if 'promptword' in self.features:
            for i, token in enumerate(tokens):
                if token in prompt_words[prompt]:
                    body[i].append('Y')
                else:
                    body[i].append('N')

        if 'stopword' in self.features:
            for i, token in enumerate(tokens):
                if token in stopwords:
                    body[i].append('Y')
                else:
                    body[i].append('N')

        if 'tf' in self.features:
            if self.token_dict == None:
                self.get_token_tf()

            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert (token in self.token_dict)

                x = int(self.token_dict[token] * self.bins)
                body[i].append(str(x))

        if 'rank' in self.features:
            if self.rank_dict == None:
                self.get_token_tf()

            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert (token in self.rank_dict)

                x = self.rank_dict[token]
                body[i].append(str(x))

        if 'color' in self.features and colors != None:
            for color in colors:
                for i, tag in enumerate(tags):
                    body[i].append(str(color[i]))

        #last row:
        tags = [tag for tag in tags]

        for i, tag in enumerate(tags):
            body[i].append(tag)

        return body
#!/usr/bin/env python
#-*- coding: utf-8 -*-

import sys
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

argvs = sys.argv
argc = len(argvs)

#show a usage message if the number of arguments is wrong
if (argc != 2):
	print('Usage: # python %s filename' % argvs[0])
	quit()
 
#set up the tagger
tagger = SennaTagger('/usr/share/senna-v2.0')

#split the text into sentences
openedFile = open(argvs[1]).read()
sent_tokenize_list = sent_tokenize(openedFile)

#tokenize the first sentence into words
word_tokenize_list = word_tokenize(sent_tokenize_list[0])

#POS tagging
for w, t in tagger.tag(word_tokenize_list):
	print(w, t)
    'put': ["perform", "mark", "evaluate", "update", "set", "change", "edit"],
    'delete': ["delete", "destroy", "kill", "remove", "cancel"]
}

# response code lists
L404 = [
    'not found', 'doesn\'t exist', 'does not exist', 'unable to find',
    'can\'t find'
]
L401 = ['unauthorized', 'not allowed', 'rejected', 'denied']
L400 = ['failed', 'unsuccessful']

# st = StanfordPOSTagger("C:/Users/Tasos/OneDriveThesis/Thesis/src/lib/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger",
#                "C:/Users/Tasos/OneDriveThesis/Thesis/src/lib/stanford-postagger-full-2015-12-09/stanford-postagger.jar")

senna_tagger = SennaTagger(
    "C:/Users/Tasos/OneDriveThesis/Thesis/src/lib/senna")
p = inflect.engine()


def resource_analysis(resources, resource_names):
    model = {}
    hateoas_graph = {}
    for resource, scenarios in resources.items():
        hateoas_graph[resource] = []
        model[resource] = {
            'get': {
                'request_params': [],
                'response': {
                    'params': [],
                    'links': []
                }
import sys
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.tag import SennaTagger
f=open(sys.argv[1],'r')
lines=sent_tokenize(f.read())
tagger=SennaTagger('/usr/share/senna-v2.0')
words=word_tokenize(lines[1])    
print(tagger.tag(words))