Ejemplo n.º 1
0
    def __init__(self, inFile, vocab=None):
        """Load tagged data from inFile and precompute its features.

        inFile: path forwarded to Data.__init__ (format defined there —
            not visible here).
        vocab: optional feature vocabulary forwarded to ExtractFeatures;
            None presumably means "build a fresh one" — TODO confirm.
        """
        self.fe = Features.FeatureExtractor()
        Data.__init__(self, inFile)

        self.tagVocab = Vocab()  #Create a vocab for indexing tags
        # 'START' is registered first, so it receives the first ID; the
        # remaining tag IDs follow the iteration order of self.tags.
        self.tagVocab.GetID('START')
        for t1 in self.tags:
            self.tagVocab.GetID(t1)
        # Lock the vocab so later lookups cannot silently add new tags.
        self.tagVocab.Lock()
        self.featurizedSentences = self.ExtractFeatures(vocab)
 def __init__(self, pos, chunk, event, classify, mallet_memory='256m'):
     """Configure the full NER pipeline.

     pos/chunk/event/classify: booleans enabling the POS tagger, the
         chunker, the event tagger and the labeled-LDA classifier.
         Chunk and event taggers require POS, so they are disabled when
         pos is false regardless of their own flag.
     mallet_memory: JVM heap size passed through to GetNer.
     """
     self.clear_line_counter()

     # Downstream taggers are only built when their prerequisites hold.
     self.posTagger = pos_tagger_stdin.PosTagger() if pos else None
     self.chunkTagger = chunk_tagger_stdin.ChunkTagger() if chunk and pos else None
     self.eventTagger = event_tagger_stdin.EventTagger() if event and pos else None
     self.llda = GetLLda() if classify else None

     # Pick the NER model trained with whichever upstream features exist.
     if pos and chunk:
         self.ner_model = 'ner.model'
     elif pos:
         self.ner_model = 'ner_nochunk.model'
     else:
         self.ner_model = 'ner_nopos_nochunk.model'

     self.ner = GetNer(self.ner_model, memory=mallet_memory)
     self.fe = Features.FeatureExtractor('%s/data/dictionaries' % (BASE_DIR))
     self.capClassifier = cap_classifier.CapClassifier()
     self.vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

     # dictMap: 1-based line number -> dictionary name.
     # (Fix: dropped the redundant `self.dictMap = self.dictMap`
     # self-assignment that was a no-op in the original.)
     self.dictMap = {}
     for idx, line in enumerate(open('%s/hbc/data/dictionaries' % (BASE_DIR)), start=1):
         self.dictMap[idx] = line.rstrip('\n')

     # Inverse mapping: dictionary name -> 1-based index.
     self.dict2index = {name: idx for idx, name in self.dictMap.items()}

     if self.llda:
         self.dictionaries = Dictionaries('%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), self.dict2index)
     # entityMap: entity string -> 0-based id (only populated when the
     # labeled-LDA classifier is active).
     self.entityMap = {}
     if self.llda:
         for eid, line in enumerate(open('%s/hbc/data/entities' % (BASE_DIR))):
             self.entityMap[line.rstrip('\n')] = eid

     # dict2label: dictionary name -> semantic label (space-separated file).
     self.dict2label = {}
     for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
         (dictionary, label) = line.rstrip('\n').split(' ')
         self.dict2label[dictionary] = label
Ejemplo n.º 3
0
    def __init__(self):
        """Minimal pipeline: NER only, with POS/chunk/event taggers disabled."""
        self.numberLines = 0
        self.eventTagger = None
        self.posTagger = None
        self.chunkTagger = None
        self.llda = GetLLda()
        self.ner = GetNer('ner_nopos_nochunk.model')
        self.fe = Features.FeatureExtractor('%s/data/dictionaries' %
                                            (BASE_DIR))
        self.capClassifier = cap_classifier.CapClassifier()

        self.vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

        # dictMap: 1-based line number -> dictionary name.
        self.dictMap = {}
        for lineNo, rawLine in enumerate(open('%s/hbc/data/dictionaries' % (BASE_DIR)), 1):
            self.dictMap[lineNo] = rawLine.rstrip('\n')

        # Inverse map: dictionary name -> 1-based index.
        dict2index = {name: idx for idx, name in self.dictMap.items()}

        if self.llda:
            self.dictionaries = Dictionaries(
                '%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), dict2index)

        # entityMap: entity string -> 0-based id, filled only when the
        # labeled-LDA classifier is available.
        self.entityMap = {}
        if self.llda:
            for entityId, rawLine in enumerate(open('%s/hbc/data/entities' % (BASE_DIR))):
                self.entityMap[rawLine.rstrip('\n')] = entityId

        # dict2label: dictionary name -> semantic label.
        self.dict2label = {}
        for rawLine in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
            dictionary, label = rawLine.rstrip('\n').split(' ')
            self.dict2label[dictionary] = label
Ejemplo n.º 4
0
    for item in scores:
	computedScores[item] = (scores[item]+ 1)/((scores[item] + len(scores))*1.0)

def displayScores():
    """Print every entry of the global computedScores map, one per line,
    as '<item> : <score>'."""
    for item, score in computedScores.items():
        print("%s : %s" % (str(item), str(score)))


# Build the full tagging pipeline: POS, chunk and event taggers plus the
# labeled-LDA classifier are all enabled here.
posTagger = pos_tagger_stdin.PosTagger()
chunkTagger = chunk_tagger_stdin.ChunkTagger()
eventTagger = event_tagger_stdin.EventTagger()
llda = GetLLda()

# Model trained with both POS and chunk features available.
ner_model = 'ner.model'
ner = GetNer(ner_model)
fe = Features.FeatureExtractor('%s/data/dictionaries' % (BASE_DIR))


capClassifier = cap_classifier.CapClassifier()

vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

# dictMap: 1-based line number -> dictionary name, read one name per line.
dictMap = {}
i = 1
for line in open('%s/hbc/data/dictionaries' % (BASE_DIR)):
    dictionary = line.rstrip('\n')
    dictMap[i] = dictionary
    i += 1

# Inverse map: dictionary name -> index.
# NOTE(review): loop body is truncated in this excerpt; presumably
# `dict2index[dictMap[i]] = i` as in the sibling snippets above.
dict2index = {}
for i in dictMap.keys():
Ejemplo n.º 5
0
 def __init__(self):
     """Set up the event-feature extractor and its tagger backend.

     Uses the event-specific dictionary set under BASE_DIR.
     GetTagger() presumably launches/attaches the tagger process —
     TODO confirm its side effects.
     """
     self.fe = Features.FeatureExtractor('%s/data/dictionaries_event' % (BASE_DIR))
     self.GetTagger()
     # Running count of tagged items; presumably incremented as input
     # is tagged elsewhere — confirm against the tagging loop.
     self.nTagged = 0
Ejemplo n.º 6
0
#!/usr/bin/python
# First pass over a tab-separated extraction file: record, per
# (entity, type, date) key, the best confidence seen.

import sys

sys.path.append('python/cap')
sys.path.append('python')

import Features

fe = Features.FeatureExtractor()
cap = Features.CapClassifier()

entityConf = {}
#First pass; confidence
for line in open(sys.argv[1]):
    line = line.rstrip('\n')
    fields = line.split('\t')

    sid = fields[0]
    # Field 10 presumably holds a timestamp; keep its date prefix
    # (first 10 chars) — TODO confirm input schema.
    date = fields[9][0:10]
    # Last field is inverted into a confidence score; the five fields
    # before it are read from the end of the row.
    confidence = 1.0 / float(fields[-1])
    eType = fields[-2]
    entity = fields[-3]
    neTags = fields[-4].split(' ')
    pos = fields[-5].split(' ')
    words = fields[-6].split(' ')

    # Keep the maximum confidence observed per (entity, type, date).
    key = "%s\t%s\t%s" % (entity, eType, date)
    # NOTE(review): dict.has_key is Python 2 only; `key in entityConf`
    # works on both 2 and 3. (else-branch is truncated in this excerpt.)
    if entityConf.has_key(key):
        entityConf[key] = max(entityConf.get(key), confidence)
    else:
Ejemplo n.º 7
0
            for i in range(len(q)):
                if q[i]:
                    quotes[start + i] = 1

            startSentence[start] = end
            endSentence[end - 1] = 1

    sentences = [words[i:startSentence[i]] for i in startSentence.keys()]
    sentenceTags = [tags[i:startSentence[i]] for i in startSentence.keys()]
    posChunk = Tag(sentences)

    #Print out the data
    #posTagger = pos_tagger_stdin.PosTagger()
    capClassifier = cap_classifier.CapClassifier()
    if "NOBROWN" in FEATURES:
        fe = Features.FeatureExtractor("data/dictionaries", None)
    else:
        fe = Features.FeatureExtractor("data/dictionaries")
    nSentences = 0
    #    for i in range(len(words)):
    #        if startSentence.has_key(i):
    #            sentenceWords = words[i:startSentence[i]]
    for sentenceWords in sentences:
        if "NOPOS" in FEATURES:
            pos = None
        elif "NEWSPOS" in FEATURES:
            pos = [x[1] for x in nltk.pos_tag(sentenceWords)]
        else:
            #pos = posTagger.TagSentence(sentenceWords)
            pos = [x[1] for x in posChunk[nSentences]]
Ejemplo n.º 8
0
# Normalization table: raw NER tags -> coarse output tags.
TAG_MAP = {
    "PERSON": "ENTITY",
    "LOCATION": "ENTITY",
    "ORGANIZATION": "MUC_ORGANIZATION",
}


def mapTag(t):
    """Return the normalized form of tag t; unknown tags pass through.

    Fix: the original used dict.has_key, which was removed in Python 3;
    dict.get with a default is equivalent and works on both versions.
    """
    return TAG_MAP.get(t, t)


fe = Features.FeatureExtractor('data/dictionaries')


def PrintFeatures(sentences):
    for s in sentences:
        words = s.split(' ')
        pos = [x[1] for x in nltk.pos_tag(words)]
        tags = []

        tag = None
        last = True
        for i in range(len(words)):
            mstart = re.search(r'^XXX([A-Z]+)-', words[i])
            mend = re.search(r'-([A-Z]+)XXX$', words[i])
            if mstart:
                tag = "B-%s" % mapTag(mstart.group(1))