def __init__(self, pos, chunk, event, classify, mallet_memory='256m'):
    """Create the tagger components and load the lookup tables they use.

    Args:
        pos: If truthy, create a POS tagger. The chunk and event taggers
            are only created when this is truthy as well.
        chunk: If truthy (together with ``pos``), create a chunk tagger.
        event: If truthy (together with ``pos``), create an event tagger.
        classify: If truthy, start the process returned by ``GetLLda`` and
            load ``self.dictionaries`` and ``self.entityMap``. When falsy,
            ``self.llda`` is None, ``self.entityMap`` stays empty and
            ``self.dictionaries`` is not set at all.
        mallet_memory: Forwarded to ``GetNer`` as its ``memory`` argument.
    """
    self.clear_line_counter()

    self.posTagger = pos_tagger_stdin.PosTagger() if pos else None
    self.chunkTagger = chunk_tagger_stdin.ChunkTagger() if chunk and pos else None
    self.eventTagger = event_tagger_stdin.EventTagger() if event and pos else None
    self.llda = GetLLda() if classify else None

    # The NER model file depends on which upstream taggers are enabled.
    if pos and chunk:
        self.ner_model = 'ner.model'
    elif pos:
        self.ner_model = 'ner_nochunk.model'
    else:
        self.ner_model = 'ner_nopos_nochunk.model'

    self.ner = GetNer(self.ner_model, memory=mallet_memory)
    self.fe = Features.FeatureExtractor('%s/data/dictionaries' % (BASE_DIR))
    self.capClassifier = cap_classifier.CapClassifier()
    self.vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

    # dictMap: 1-based line number -> dictionary name, one name per line.
    self.dictMap = {}
    with open('%s/hbc/data/dictionaries' % (BASE_DIR)) as dict_file:
        for index, line in enumerate(dict_file, 1):
            self.dictMap[index] = line.rstrip('\n')

    # dict2index is the inverse mapping: dictionary name -> line number.
    self.dict2index = {}
    for index, name in self.dictMap.items():
        self.dict2index[name] = index

    # entityMap: entity string -> 0-based line number. Only filled when the
    # LabeledLDA process was started.
    self.entityMap = {}
    if self.llda:
        self.dictionaries = Dictionaries('%s/data/LabeledLDA_dictionaries3' % (BASE_DIR), self.dict2index)
        with open('%s/hbc/data/entities' % (BASE_DIR)) as entity_file:
            for index, line in enumerate(entity_file):
                self.entityMap[line.rstrip('\n')] = index

    # dict2label: each line is "<dictionary> <label>", split on one space.
    self.dict2label = {}
    with open('%s/hbc/data/dict-label3' % (BASE_DIR)) as label_file:
        for line in label_file:
            dictionary, label = line.rstrip('\n').split(' ')
            self.dict2label[dictionary] = label
Exemple #2
0
    #return subprocess.Popen('java -Xmx256m -cp %s/mallet-2.0.6/lib/mallet-deps.jar:%s/mallet-2.0.6/class cc.mallet.fst.SimpleTaggerStdin --weights sparse --model-file %s/models/ner/%s' % (BASE_DIR, BASE_DIR, BASE_DIR, ner_model),
    return subprocess.Popen('java -Xmx512m -cp %s/mallet-2.0.6/lib/mallet-deps.jar:%s/mallet-2.0.6/class cc.mallet.fst.SimpleTaggerStdin --weights sparse --model-file %s/models/ner/%s' % (BASE_DIR, BASE_DIR, BASE_DIR, ner_model), shell=True, close_fds=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE)

def GetLLda():
    """Launch the LabeledLDA inference executable as a child process.

    The command line is built from ``BASE_DIR`` and run through the shell,
    with the child's stdin and stdout both connected to pipes.

    Returns:
        The ``subprocess.Popen`` handle of the started process.
    """
    command = '%s/hbc/models/LabeledLDA_infer_stdin.out %s/hbc/data/combined.docs.hbc %s/hbc/data/combined.z.hbc 100 100' % ((BASE_DIR,) * 3)
    return subprocess.Popen(
        command,
        shell=True,
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )

def recomputeScores():
    """Refresh ``computedScores`` from the module-level ``scores`` mapping.

    For every key in ``scores`` the stored value becomes
    ``(score + 1) / (score + len(scores))``, computed as a float.
    """
    # len(scores) does not change inside the loop, so read it once.
    total = len(scores)
    for item, count in scores.items():
        # Multiplying by 1.0 forces float division on interpreters where
        # int / int truncates.
        computedScores[item] = (count + 1) / ((count + total) * 1.0)

def displayScores():
    """Print each entry of the module-level ``computedScores`` mapping.

    One line per entry, formatted as ``<key> : <value>``.
    """
    for item, value in computedScores.items():
        print(str(item) + " : " + str(value))


# Module-level setup: these statements run at import/script start and create
# the shared tagger objects and helper processes used further down.
posTagger = pos_tagger_stdin.PosTagger()
chunkTagger = chunk_tagger_stdin.ChunkTagger()
eventTagger = event_tagger_stdin.EventTagger()
# GetLLda() starts an external process and returns its Popen handle.
llda = GetLLda()

# NER model file name handed to GetNer.
ner_model = 'ner.model'
ner = GetNer(ner_model)
# Feature extractor initialised with the dictionaries directory under BASE_DIR.
fe = Features.FeatureExtractor('%s/data/dictionaries' % (BASE_DIR))


capClassifier = cap_classifier.CapClassifier()

# Vocabulary built from the hbc vocab file under BASE_DIR.
vocab = Vocab('%s/hbc/data/vocab' % (BASE_DIR))

# NOTE(review): dictMap and the counter i (starting at 1) are presumably
# filled by a loop that follows this point — confirm against the full file.
dictMap = {}
i = 1