def __init__(self, pos, chunk, event, classify, mallet_memory='256m'):
    """Start the tagger subprocesses and load dictionary/label lookups.

    Args:
        pos: enable POS tagging (the chunk and event taggers require it).
        chunk: enable chunk tagging (only effective when ``pos`` is set).
        event: enable event tagging (only effective when ``pos`` is set).
        classify: enable LabeledLDA entity classification.
        mallet_memory: JVM heap size forwarded to the mallet NER process.
    """
    self.clear_line_counter()
    # Chunk and event taggers consume POS tags, so they are only started
    # when POS tagging is enabled as well.
    self.posTagger = pos_tagger_stdin.PosTagger() if pos else None
    self.chunkTagger = chunk_tagger_stdin.ChunkTagger() if chunk and pos else None
    self.eventTagger = event_tagger_stdin.EventTagger() if event and pos else None
    self.llda = GetLLda() if classify else None

    # Pick the NER model trained with the feature set that is available.
    if pos and chunk:
        self.ner_model = 'ner.model'
    elif pos:
        self.ner_model = 'ner_nochunk.model'
    else:
        self.ner_model = 'ner_nopos_nochunk.model'
    self.ner = GetNer(self.ner_model, memory=mallet_memory)

    self.fe = Features.FeatureExtractor('%s/data/dictionaries' % (BASE_DIR))
    self.capClassifier = cap_classifier.CapClassifier()
    self.vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

    # 1-based index -> dictionary name.
    # Fixes: `with` closes the file handle (the original leaked it),
    # enumerate replaces the manual counter, and the redundant no-op
    # `self.dictMap = self.dictMap` was removed.
    self.dictMap = {}
    with open('%s/hbc/data/dictionaries' % (BASE_DIR)) as dict_file:
        for i, line in enumerate(dict_file, start=1):
            self.dictMap[i] = line.rstrip('\n')

    # Inverse mapping: dictionary name -> 1-based index.
    self.dict2index = {}
    for i in self.dictMap.keys():
        self.dict2index[self.dictMap[i]] = i

    if self.llda:
        self.dictionaries = Dictionaries('%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), self.dict2index)

    # Entity string -> 0-based id; only populated when LabeledLDA is on.
    self.entityMap = {}
    if self.llda:
        with open('%s/hbc/data/entities' % (BASE_DIR)) as entity_file:
            for i, line in enumerate(entity_file):
                self.entityMap[line.rstrip('\n')] = i

    # Dictionary name -> entity-type label.
    self.dict2label = {}
    with open('%s/hbc/data/dict-label3' % (BASE_DIR)) as label_file:
        for line in label_file:
            (dictionary, label) = line.rstrip('\n').split(' ')
            self.dict2label[dictionary] = label
#return subprocess.Popen('java -Xmx256m -cp %s/mallet-2.0.6/lib/mallet-deps.jar:%s/mallet-2.0.6/class cc.mallet.fst.SimpleTaggerStdin --weights sparse --model-file %s/models/ner/%s' % (BASE_DIR, BASE_DIR, BASE_DIR, ner_model), return subprocess.Popen('java -Xmx512m -cp %s/mallet-2.0.6/lib/mallet-deps.jar:%s/mallet-2.0.6/class cc.mallet.fst.SimpleTaggerStdin --weights sparse --model-file %s/models/ner/%s' % (BASE_DIR, BASE_DIR, BASE_DIR, ner_model), shell=True, close_fds=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE) def GetLLda(): return subprocess.Popen('%s/hbc/models/LabeledLDA_infer_stdin.out %s/hbc/data/combined.docs.hbc %s/hbc/data/combined.z.hbc 100 100' % (BASE_DIR, BASE_DIR, BASE_DIR), shell=True, close_fds=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE) def recomputeScores(): for item in scores: computedScores[item] = (scores[item]+ 1)/((scores[item] + len(scores))*1.0) def displayScores(): for item in computedScores: print(str(item) + " : " + str(computedScores[item])) posTagger = pos_tagger_stdin.PosTagger() chunkTagger = chunk_tagger_stdin.ChunkTagger() eventTagger = event_tagger_stdin.EventTagger() llda = GetLLda() ner_model = 'ner.model' ner = GetNer(ner_model) fe = Features.FeatureExtractor('%s/data/dictionaries' % (BASE_DIR)) capClassifier = cap_classifier.CapClassifier() vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR)) dictMap = {} i = 1