def __init__(self, inFile, vocab=None):
    """Load a tagged data file, index its tag set, and featurize it.

    inFile -- path handed to the Data base-class loader.
    vocab  -- optional feature vocabulary forwarded to ExtractFeatures.
    """
    self.fe = Features.FeatureExtractor()
    Data.__init__(self, inFile)
    # Build a locked vocabulary over the tags; 'START' claims the first ID.
    self.tagVocab = Vocab()
    self.tagVocab.GetID('START')
    for tag in self.tags:
        self.tagVocab.GetID(tag)
    self.tagVocab.Lock()
    self.featurizedSentences = self.ExtractFeatures(vocab)
def __init__(self, pos, chunk, event, classify, mallet_memory='256m'):
    """Set up the tagging pipeline and its dictionary/entity lookup tables.

    pos, chunk, event, classify -- booleans enabling the POS tagger, the
    chunker, the event tagger, and LabeledLDA classification respectively
    (chunk and event taggers also require pos).
    mallet_memory -- heap size string passed to the Mallet NER process.
    """
    self.clear_line_counter()
    self.posTagger = pos_tagger_stdin.PosTagger() if pos else None
    # The chunk and event taggers both consume POS output, so they are
    # only constructed when POS tagging is enabled too.
    self.chunkTagger = chunk_tagger_stdin.ChunkTagger() if chunk and pos else None
    self.eventTagger = event_tagger_stdin.EventTagger() if event and pos else None
    self.llda = GetLLda() if classify else None

    # Pick the NER model trained with whichever features will be present.
    if pos and chunk:
        self.ner_model = 'ner.model'
    elif pos:
        self.ner_model = 'ner_nochunk.model'
    else:
        self.ner_model = 'ner_nopos_nochunk.model'
    self.ner = GetNer(self.ner_model, memory=mallet_memory)

    self.fe = Features.FeatureExtractor('%s/data/dictionaries' % (BASE_DIR))
    self.capClassifier = cap_classifier.CapClassifier()
    self.vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

    # 1-based dictionary index -> dictionary name, plus the inverse map.
    # (Removed the original's no-op `self.dictMap = self.dictMap`.)
    self.dictMap = {}
    for i, line in enumerate(open('%s/hbc/data/dictionaries' % (BASE_DIR)), 1):
        self.dictMap[i] = line.rstrip('\n')
    self.dict2index = {name: idx for idx, name in self.dictMap.items()}
    if self.llda:
        self.dictionaries = Dictionaries('%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), self.dict2index)

    # Entity string -> 0-based index; only needed when LabeledLDA is active.
    self.entityMap = {}
    if self.llda:
        for i, line in enumerate(open('%s/hbc/data/entities' % (BASE_DIR))):
            self.entityMap[line.rstrip('\n')] = i

    # Dictionary name -> output label for classification.
    self.dict2label = {}
    for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
        (dictionary, label) = line.rstrip('\n').split(' ')
        self.dict2label[dictionary] = label
def __init__(self):
    """Initialize the no-POS/no-chunk configuration: NER plus lookup tables."""
    self.numberLines = 0
    # No POS/chunk/event taggers are used in this configuration.
    self.eventTagger = None
    self.posTagger = None
    self.chunkTagger = None
    self.llda = GetLLda()
    self.ner = GetNer('ner_nopos_nochunk.model')
    self.fe = Features.FeatureExtractor('%s/data/dictionaries' % (BASE_DIR))
    self.capClassifier = cap_classifier.CapClassifier()
    self.vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

    # 1-based dictionary index -> name, and the inverse for lookups.
    self.dictMap = {}
    for idx, line in enumerate(open('%s/hbc/data/dictionaries' % (BASE_DIR)), 1):
        self.dictMap[idx] = line.rstrip('\n')
    dict2index = {name: idx for idx, name in self.dictMap.items()}
    if self.llda:
        self.dictionaries = Dictionaries(
            '%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), dict2index)

    # Entity string -> 0-based index (populated only when LabeledLDA is on).
    self.entityMap = {}
    if self.llda:
        for idx, line in enumerate(open('%s/hbc/data/entities' % (BASE_DIR))):
            self.entityMap[line.rstrip('\n')] = idx

    # Dictionary name -> label.
    self.dict2label = {}
    for line in open('%s/hbc/data/dict-label3' % (BASE_DIR)):
        (dictionary, label) = line.rstrip('\n').split(' ')
        self.dict2label[dictionary] = label
for item in scores: computedScores[item] = (scores[item]+ 1)/((scores[item] + len(scores))*1.0) def displayScores(): for item in computedScores: print(str(item) + " : " + str(computedScores[item])) posTagger = pos_tagger_stdin.PosTagger() chunkTagger = chunk_tagger_stdin.ChunkTagger() eventTagger = event_tagger_stdin.EventTagger() llda = GetLLda() ner_model = 'ner.model' ner = GetNer(ner_model) fe = Features.FeatureExtractor('%s/data/dictionaries' % (BASE_DIR)) capClassifier = cap_classifier.CapClassifier() vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR)) dictMap = {} i = 1 for line in open('%s/hbc/data/dictionaries' % (BASE_DIR)): dictionary = line.rstrip('\n') dictMap[i] = dictionary i += 1 dict2index = {} for i in dictMap.keys():
def __init__(self):
    """Build the event feature extractor, start the tagger, reset the counter."""
    dict_path = '%s/data/dictionaries_event' % (BASE_DIR)
    self.fe = Features.FeatureExtractor(dict_path)
    self.GetTagger()
    self.nTagged = 0
#!/usr/bin/python import sys sys.path.append('python/cap') sys.path.append('python') import Features fe = Features.FeatureExtractor() cap = Features.CapClassifier() entityConf = {} #First pass; confidence for line in open(sys.argv[1]): line = line.rstrip('\n') fields = line.split('\t') sid = fields[0] date = fields[9][0:10] confidence = 1.0 / float(fields[-1]) eType = fields[-2] entity = fields[-3] neTags = fields[-4].split(' ') pos = fields[-5].split(' ') words = fields[-6].split(' ') key = "%s\t%s\t%s" % (entity, eType, date) if entityConf.has_key(key): entityConf[key] = max(entityConf.get(key), confidence) else:
for i in range(len(q)): if q[i]: quotes[start + i] = 1 startSentence[start] = end endSentence[end - 1] = 1 sentences = [words[i:startSentence[i]] for i in startSentence.keys()] sentenceTags = [tags[i:startSentence[i]] for i in startSentence.keys()] posChunk = Tag(sentences) #Print out the data #posTagger = pos_tagger_stdin.PosTagger() capClassifier = cap_classifier.CapClassifier() if "NOBROWN" in FEATURES: fe = Features.FeatureExtractor("data/dictionaries", None) else: fe = Features.FeatureExtractor("data/dictionaries") nSentences = 0 # for i in range(len(words)): # if startSentence.has_key(i): # sentenceWords = words[i:startSentence[i]] for sentenceWords in sentences: if "NOPOS" in FEATURES: pos = None elif "NEWSPOS" in FEATURES: pos = [x[1] for x in nltk.pos_tag(sentenceWords)] else: #pos = posTagger.TagSentence(sentenceWords) pos = [x[1] for x in posChunk[nSentences]]
# Collapse fine-grained NE types into the coarser label set used downstream.
TAG_MAP = {
    "PERSON": "ENTITY",
    "LOCATION": "ENTITY",
    "ORGANIZATION": "MUC_ORGANIZATION",
}


def mapTag(t):
    """Return the coarse label for tag *t*, or *t* itself if unmapped."""
    # NOTE(review): dict.has_key is Python 2 only; TAG_MAP.get(t, t) would
    # be the two-version-safe equivalent.
    if TAG_MAP.has_key(t):
        return TAG_MAP[t]
    else:
        return t


fe = Features.FeatureExtractor('data/dictionaries')


def PrintFeatures(sentences):
    """Derive BIO tags from inline XXX...XXX entity markers in *sentences*.

    Each sentence is a space-separated string; tokens are POS-tagged with
    nltk.pos_tag. A token matching 'XXX<TYPE>-' opens an entity span and
    yields a 'B-<mapped type>' tag.

    NOTE(review): this function appears truncated in the visible chunk —
    mend/tags/last are set up but their use (I-/O tagging, output) is not
    shown here.
    """
    for s in sentences:
        words = s.split(' ')
        # POS tags aligned with words (second element of each (word, tag) pair).
        pos = [x[1] for x in nltk.pos_tag(words)]
        tags = []
        tag = None
        last = True
        for i in range(len(words)):
            # Start marker: token begins with 'XXX<CAPS>-'.
            mstart = re.search(r'^XXX([A-Z]+)-', words[i])
            # End marker: token ends with '-<CAPS>XXX'.
            mend = re.search(r'-([A-Z]+)XXX$', words[i])
            if mstart:
                tag = "B-%s" % mapTag(mstart.group(1))