def makeBuckets(self, n_buck, heldout):
    """Split n-gram histories into `n_buck` buckets of roughly equal
    total frequency and create initial smoothing parameters."""
    n = self._n
    counts = ADict()
    for line in heldout:
        for ngram in NGramIter(n, line, ['<s>'] * (n - 1)):
            hist = ngram[:-1]
            counts[hist] += 1.

    sum_hist = sum(counts.values())
    f_max = float(sum_hist) / float(n_buck)

    b = [set()]
    b_sum = 0.
    for hist, count in sorted(counts.iteritems(), key=lambda i: i[1],
                              reverse=True):
        if count == 1:
            # Singleton histories are skipped here; they are covered by
            # the extra parameter vector appended below
            continue
        if b_sum + count > f_max:
            b.append(set())
            b_sum = 0.
        b[-1].add(hist)
        b_sum += count

    ret = {}
    params = []
    for idx, bucket in enumerate(b):
        # Uniform initial interpolation weights for each bucket
        params.append([1. / (n + 1.) for i in range(n + 1)])
        for hist in bucket:
            ret[hist] = idx
    if n_buck != 1:
        # One extra parameter vector for unseen/singleton histories
        params.append([1. / (n + 1.) for i in range(n + 1)])
    return ret, params
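# A minimal standalone sketch of the frequency-bucketing scheme used by
# makeBuckets above, written with plain dicts instead of ADict/NGramIter
# (hypothetical demo code, not part of the module API):
def _demo_bucket_histories(counts, n_buck):
    """counts: {history: frequency}; returns {history: bucket_index}."""
    f_max = float(sum(counts.values())) / n_buck
    buckets = [set()]
    b_sum = 0.
    for hist, count in sorted(counts.items(), key=lambda i: i[1],
                              reverse=True):
        if count == 1:
            continue  # singletons are left for the extra bucket
        if b_sum + count > f_max:
            buckets.append(set())
            b_sum = 0.
        buckets[-1].add(hist)
        b_sum += count
    return dict((h, i) for i, bucket in enumerate(buckets) for h in bucket)

# Example (note the greedy filling can open one more bucket than requested,
# which matches the extra parameter vector in makeBuckets):
#   _demo_bucket_histories({'a': 10, 'b': 8, 'c': 5, 'd': 3, 'e': 1}, 2)
#   -> {'a': 0, 'b': 1, 'c': 1, 'd': 2}   ('e' is a singleton, unassigned)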
def main(self, files, conceptMap, symMap, outDir, symName='word',
         extraExt='', obsColumn=0, negExamples=None, posExamples=None,
         writeConst=None, appendConst=None):
    if writeConst is not None and appendConst is not None:
        raise ValueError("Use only writeConst or appendConst, not both")

    self.setupMaps(conceptMap, symMap)

    # Expand directories into the *.hddn files they contain
    globfiles = set()
    for fn in files:
        if os.path.isdir(fn):
            globfiles |= set(glob(os.path.join(fn, '*.hddn')))
        else:
            globfiles.add(fn)

    file_ids = set()
    for fn in globfiles:
        file_id = fn.replace('.hddn', '')
        file_ids.add(file_id)
    file_ids = sorted(file_ids)

    if negExamples and negExamples != 'off':
        neg_ex = ADict.readFromFile(negExamples)
        neg_ex = self.mapExamples(neg_ex)
    else:
        neg_ex = {}

    if posExamples and posExamples != 'off':
        pos_ex = ADict.readFromFile(posExamples)
        pos_ex = self.mapExamples(pos_ex)
    else:
        pos_ex = {}

    if writeConst:
        constFile = writeConst
        constAppend = False
    else:
        constFile = appendConst
        constAppend = True

    ho_generator = self.generateHO(file_ids, extraExt, obsColumn)
    dts, histories = self.makeDTS(ho_generator)
    word_C = self.makeSparseProb(histories, pos_ex, neg_ex)
    self.storeResults(outDir, symName, dts, word_C, constFile, constAppend)
def fsmDecode(self, model_dir, lines):
    start = None
    heap = ADict(default=list)
    # for dcdline in self.fsmrawdecode(model_dir, stdin=lines):
    for dcdline in fsmrawdecode(model_dir, lines, debug=self.debugMain):
        dcdsplit = dcdline.split()
        if len(dcdsplit) == 1:
            # Final-state line, no transition
            continue
        elif len(dcdsplit) == 5:
            s1, s2, w, stack, weight = dcdsplit
        elif len(dcdsplit) == 4:
            s1, s2, w, stack = dcdsplit
            weight = 0.
        else:
            raise ValueError("Bad output line from decoder: %r" % dcdline)
        s1 = int(s1)
        s2 = int(s2)
        w = int(w)
        stack = int(stack)
        weight = float(weight)
        if start is None:
            start = s1
        heap[s1].append((s2, w, stack, weight))

    # Walk the transitions from the start state, following each path
    # until it leaves the transition table
    while heap[start]:
        new_state = start
        while new_state in heap:
            trans = heap[new_state].pop(0)
            if not heap[new_state]:
                del heap[new_state]
            old_state = new_state
            new_state, w, stack, weight = trans
            yield w, stack, weight
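# A minimal standalone sketch of the path reconstruction done by
# fsmDecode/decodeOutput above: transitions are indexed by source state,
# and each pass of the outer loop pops one linear path starting from the
# start state (hypothetical demo, plain defaultdict instead of ADict):
from collections import defaultdict

def _demo_walk_paths(transitions, start):
    """transitions: list of (src, dst, label); yields labels path by path."""
    heap = defaultdict(list)
    for src, dst, label in transitions:
        heap[src].append((dst, label))
    while heap.get(start):
        state = start
        while state in heap:
            dst, label = heap[state].pop(0)
            if not heap[state]:
                del heap[state]
            state = dst
            yield label

# Example: two paths 0->1->2 and 0->3 share the start state 0:
#   list(_demo_walk_paths([(0, 1, 'a'), (1, 2, 'b'), (0, 3, 'c')], 0))
#   -> ['a', 'b', 'c']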
def collectNeg_basic(self, stacks, words):
    neg_ex = ADict()
    concepts = set()
    for stack in stacks:
        concepts |= set(stack)
    for rootConcept in DOMINATING_CONCEPT:
        if rootConcept in concepts:
            # The utterance can be used as a negative example
            for leafConcepts in DEPENDENT_CONCEPTS:
                # One group of dependent concepts (generally numbers)
                ok = True
                for leafConcept in leafConcepts:
                    if leafConcept in concepts:
                        ok = False
                if ok:
                    # None of the dependent concepts occurs in the
                    # utterance, so all its words can be used as
                    # negative examples for them
                    for leafConcept in leafConcepts:
                        for w in words:
                            neg_ex[leafConcept, w] += 1
            # The example was already used
            break
    return neg_ex
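# A toy illustration of the negative-example rule implemented above, with
# hypothetical concept inventories (the real DOMINATING_CONCEPT and
# DEPENDENT_CONCEPTS are defined elsewhere in the module): if a dominating
# concept occurs in the utterance but a whole group of dependent concepts
# does not, every word of the utterance becomes a negative example for
# every concept of that group.
#
#   DOMINATING_CONCEPT = ['DEPARTURE']
#   DEPENDENT_CONCEPTS = [['TIME', 'HOUR']]
#   stacks = [('DEPARTURE',)], words = ['when', 'does', 'it', 'go']
#   -> neg_ex[('TIME', 'when')] == 1, neg_ex[('HOUR', 'go')] == 1, ...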
def main(self, files, dataset, default_datasets, posout=None, negout=None,
         pos_method='basic', neg_method='basic', disable_ne=False,
         disable_common=False):
    if pos_method not in ['basic', 'netyped', 'off']:
        raise ValueError("Unsupported pos_method: %s" % pos_method)
    if neg_method not in ['basic', 'derived', 'derclass', 'off']:
        raise ValueError("Unsupported neg_method: %s" % neg_method)

    default_dataset, common_dataset = default_datasets.split(',')
    if disable_common:
        common_dataset = dataset
    self.disable_ne = disable_ne

    files = self.globFiles(files, '*.xml')
    self.posTags = ADict(default=set)

    if dataset == 'off':
        pos_ex = ADict()
        neg_ex = ADict()
        equiv = ADict()
    else:
        pos_ex, neg_ex, equiv = self.collectExamples(
            files, dataset, common_dataset, default_dataset,
            pos_method, neg_method)

    neg_ex = self.filterNumberNegEx(neg_ex)
    pos_ex = self.doEquivalenceTable(pos_ex, equiv)
    neg_ex = self.doEquivalenceTable(neg_ex, equiv)

    # Sort by concept, then by descending count, then by word
    key = lambda (k, v): (k[0], -v, k[1])
    if posout is not None:
        pos_ex.writeToFile(posout, key=key, format='%r')
    if negout is not None:
        neg_ex.writeToFile(negout, key=key)
def separateDAs(self, dlgs):
    ret = ADict(default=list)
    for dlg in dlgs:
        for da_speaker, da_type, da_words in dlg:
            da = self.mapDAtype(da_speaker, da_type)
            if da is None:
                continue
            ret[da].append(da_words)
    return ret
def normalizedEdgesFrom(self, node):
    lst = ADict()
    for new_node, edge in self.fsm.edgesFrom(node):
        isym = edge[0]
        osym = edge[1]
        if len(edge) == 2:
            weight = 1
        else:
            weight = edge[2]
        lst[new_node, isym, osym] += weight
    # Normalize the accumulated weights so they sum to one
    lst_sum = float(lst.sum())
    for (new_node, isym, osym), weight in lst.iteritems():
        weight /= lst_sum
        yield new_node, isym, osym, weight
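# A minimal standalone sketch of the normalization above, assuming edges
# are given as (target, (isym, osym[, weight])) tuples with a default
# weight of 1 (hypothetical demo, plain dicts instead of ADict):
def _demo_normalized_edges(edges):
    totals = {}
    for target, edge in edges:
        key = (target, edge[0], edge[1])
        weight = edge[2] if len(edge) > 2 else 1
        totals[key] = totals.get(key, 0) + weight
    norm = float(sum(totals.values()))
    return sorted((t, i, o, w / norm) for (t, i, o), w in totals.items())

# Example: two arcs with weights 3 and 1 normalize to 0.75 and 0.25:
#   _demo_normalized_edges([(1, ('a', 'x', 3)), (2, ('b', 'y'))])
#   -> [(1, 'a', 'x', 0.75), (2, 'b', 'y', 0.25)]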
def main(self, concept_map, sym_map, examples, output, threshold):
    self.conceptMap = SymMap.readFromFile(
        concept_map, format=(int, unicode)).inverse
    self.symMap = SymMap.readFromFile(
        sym_map, format=(int, unicode)).inverse
    examples = ADict.readFromFile(examples)
    examples = self.mapExamples(examples, threshold)
    # Sort by concept, then by descending count, then by word
    key = lambda (k, v): (k[0], -v, k[1])
    examples.writeToFile(output, key=key)
def collectExamples(self, files, dataset, common_dataset, default_dataset,
                    pos_method, neg_method):
    pos_ex = ADict()
    neg_ex = ADict()
    netyped_ex = ADict()
    equiv = ADict(default=set)
    all_stacks = set()
    all_words = set()

    for ne_type, fw, stacks in self.genNETypeWordConcept(
            files, [dataset], common_dataset, default_dataset):
        items = zip(*fw)
        words = items[0]
        for w, word in fw:
            all_words.add(w)
            equiv[w].add(word)
        for stack in stacks:
            stack = stack[1:]  # Skip superroot None
            if stack:
                all_stacks.add(tuple(stack))

        # Collection of positive examples
        if pos_method in ('basic', 'netyped'):
            pos_ex += self.collectPos_basic(stacks, words)
        if pos_method == 'netyped':
            netyped_ex += self.collectPos_netyped(ne_type, words)

        # Collection of negative examples
        if neg_method == 'basic':
            neg_ex += self.collectNeg_basic(stacks, words)

    # Derived negative examples need the complete sets of stacks, words
    # and positive examples, so they are collected after the main loop
    if neg_method == 'derived':
        neg_ex += self.collectNeg_derived(all_stacks, all_words, pos_ex)
    elif neg_method == 'derclass':
        neg_ex += self.collectNeg_derclass(all_stacks, all_words, pos_ex)

    if pos_method == 'netyped':
        pos_ex = netyped_ex
    return pos_ex, neg_ex, equiv
def collectPos_basic(self, stacks, words):
    pos_ex = ADict()
    for stack in stacks:
        c = stack[-1]
        if c is None:
            # Ignore superroot of tree
            continue
        for c_ in self.otherConceptsFromGroup(c):
            for w in words:
                pos_ex[c_, w] += 1
    return pos_ex
def getHitMissFA(self):
    if 'HitMissFA' in self._cache:
        return self._cache['HitMissFA']
    hit = ADict()
    miss = ADict()
    fa = ADict()
    for a, b in self:
        if a == b:
            hit[a] += 1
        elif a is None:
            fa[b] += 1
        elif b is None:
            miss[a] += 1
        else:
            # A substitution counts both as a miss of `a` and a false
            # alarm of `b`
            miss[a] += 1
            fa[b] += 1
    self._cache['HitMissFA'] = hit, miss, fa
    return hit, miss, fa
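# A toy run of the scoring above on an aligned (reference, hypothesis)
# sequence; None marks an empty side of the alignment (hypothetical demo):
#
#   pairs = [('A', 'A'), ('B', 'C'), (None, 'D'), ('E', None)]
#   -> hit  = {'A': 1}
#      miss = {'B': 1, 'E': 1}   # substitution of B, deletion of E
#      fa   = {'C': 1, 'D': 1}   # substitution by C, insertion of D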
def collectNeg_derived(self, histories, words, pos_ex):
    neg_ex = ADict()
    concepts = set()
    for stack in histories:
        concepts.add(stack[-1])
    for c in concepts:
        for w in words:
            if pos_ex[c, w] == 0:
                neg_ex[c, w] += 1
    return neg_ex
def genNETypeWordConcept(self, files, datasets, common_dataset,
                         default_dataset):
    g = input.MultiReader(files, input.DXMLReader)
    g = input.InputGenerator(g, datasets, default_dataset)
    g = self.generalizer(g, common_dataset)
    for da in g:
        type_counts = ADict()
        ne_typed = [(ne_type,) + (gen,) + w
                    for ne_type, gen, w in zip(da['ne_typed'],
                                               da['generalized'],
                                               da['requested'])]
        filtered = self.filterNEtext(ne_typed)
        for item in filtered:
            ne_type = item[0]
            if ne_type is not None:
                type_counts[ne_type.upper()] += 1

        semantics = da.get('semantics', '')
        tree = OrderedTree.fromString(semantics)
        tree_counts = tree.getConceptCounts()

        # A named-entity type is usable only if it occurs in the text
        # exactly as often as in the semantic tree
        bad_ne = set()
        for concept, count in type_counts.iteritems():
            if tree_counts.get(concept, 0) != count:
                bad_ne.add(concept)
        ne_types = set(type_counts.keys()) - set(bad_ne)

        if not bad_ne:
            splits = tree.splitStateVector(*list(ne_types))
            for (ne_type, text), states in zip(filtered, splits):
                yield ne_type, text, states
        else:
            # Some conflicts or no named entities
            states = tree.toStateVector()
            for (ne_type, text) in filtered:
                if ne_type in tree.conceptCounts:
                    only_states = [i for i in states if ne_type in i]
                    yield ne_type, text, only_states
                else:
                    yield ne_type, text, states
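# A minimal sketch of the consistency check above, using a plain Counter:
# a named-entity type is kept only when it is seen in the text exactly as
# many times as the corresponding concept occurs in the semantic tree
# (hypothetical demo, not part of the module API):
from collections import Counter

def _demo_consistent_ne_types(text_types, tree_counts):
    type_counts = Counter(t.upper() for t in text_types if t is not None)
    bad = set(t for t, n in type_counts.items()
              if tree_counts.get(t, 0) != n)
    return set(type_counts) - bad, bad

# Example: STATION matches (2 == 2), TIME conflicts (1 != 0):
#   _demo_consistent_ne_types(['station', None, 'station', 'time'],
#                             {'STATION': 2})
#   -> ({'STATION'}, {'TIME'})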
def collectNeg_derclass(self, histories, words, pos_ex):
    neg_ex = ADict()
    concepts = set()
    for conceptGroup in DEPENDENT_CONCEPTS:
        concepts |= set(conceptGroup)
    for c in concepts:
        for w in words:
            if pos_ex[c, w] == 0:
                neg_ex[c, w] += 1
    return neg_ex
def mapExamples(self, dict, trsh):
    ret = ADict()
    for concept_word, count in dict.iteritems():
        if len(concept_word) != 2:
            # TODO: REMOVE: used only with bigram_lemma
            continue
        c, w = concept_word
        if c not in self.conceptMap:
            continue
        if w not in self.symMap:
            continue
        if count < trsh:
            continue
        ret[c, w] += count
    return ret
def mapExamples(self, dict):
    _unseen_ = self.symMap['_unseen_']
    ret = {}
    for concept_word, count in dict.iteritems():
        if len(concept_word) != 2:
            # TODO: REMOVE: used only with bigram_lemma
            continue
        c, w = concept_word
        if c not in self.conceptMap:
            continue
        c = self.conceptMap[c]
        w = self.symMap.get(w, _unseen_)
        if c not in ret:
            ret[c] = ADict()
        ret[c][w] += count
    return ret
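# A toy illustration of the mapping above: concepts outside conceptMap
# are dropped, and unknown words fall back to the '_unseen_' symbol
# (hypothetical maps and counts):
#
#   conceptMap = {'TIME': 3},  symMap = {'now': 7, '_unseen_': 0}
#   examples   = {('TIME', 'now'): 4, ('TIME', 'xyz'): 1, ('FOO', 'now'): 2}
#   -> {3: {7: 4, 0: 1}}   # 'FOO' dropped, 'xyz' mapped to _unseen_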
def getErrStat(self):
    if 'ErrStat' in self._cache:
        return self._cache['ErrStat']
    stat = ADict()
    for a, b in self:
        if a == b:
            continue
        elif a is None:
            stat[b] += 1
        elif b is None:
            stat[a] += 1
        else:
            # A substitution is split between the two labels involved
            stat[a] += 0.5
            stat[b] += 0.5
    self._cache['ErrStat'] = stat
    return stat
def decodeOutput(self, outputs):
    start = None
    heap = ADict(default=list)
    for dcdline in outputs:
        dcdsplit = dcdline.split()
        if len(dcdsplit) == 1:
            continue
        elif len(dcdsplit) == 2:
            continue
        elif len(dcdsplit) == 5:
            s1, s2, isym, osym, weight = dcdsplit
        elif len(dcdsplit) == 4:
            s1, s2, isym, osym = dcdsplit
            weight = 0.
        else:
            raise ValueError("Bad output line from decoder: %r" % dcdline)
        s1 = int(s1)
        s2 = int(s2)
        isym = int(isym)
        osym = int(osym)
        weight = float(weight)
        if start is None:
            start = s1
        heap[s1].append((s2, isym, osym, weight))

    while heap[start]:
        new_state = start
        while new_state in heap:
            trans = heap[new_state].pop(0)
            if not heap[new_state]:
                del heap[new_state]
            old_state = new_state
            new_state, isym, osym, weight = trans
            yield isym, osym, weight
        # Mark the end of each decoded path
        yield None
def genStates(self):
    processed = set()
    backoff_stat = ADict(default=set)
    osym_map = SymMap()
    osym_map['epsilon'] = 0

    # Conditional probability tables of the concept/stack model
    pop_Given_C = self.workspace[gmtk.SCPT, 'popGivenC1C2C3C4']
    push_Given_C = self.workspace[gmtk.SCPT, 'pushGivenC1C2C3C4']

    c1_Given_C234 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3C4']
    c1_Given_C23 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3']
    c1_Given_C2 = self.workspace[gmtk.DCPT, 'concept1GivenC2']
    c1_backoff = self.workspace[gmtk.DT, 'backoffC2C3C4']
    c2_Given_C = self.workspace[gmtk.SCPT, 'concept2GivenC3C4']

    s1_Given_C1234 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3C4']
    s1_Given_C123 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3']
    s1_Given_C12 = self.workspace[gmtk.SCPT, 's1GivenC1C2']
    s1_Given_C1 = self.workspace[gmtk.DCPT, 's1GivenC1']
    s1_Unigram = self.workspace[gmtk.DCPT, 's1Unigram']
    s1_backoff = self.workspace[gmtk.DT, 'backoffC1C2C3C4']

    s2_Given_C1234 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3C4']
    s2_Given_C123 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3']
    s2_Given_C12 = self.workspace[gmtk.SCPT, 's2GivenC1C2']
    s2_Given_C1 = self.workspace[gmtk.DCPT, 's2GivenC1']
    s2_Unigram = self.workspace[gmtk.DCPT, 's2Unigram']

    s3_Given_C1234 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3C4']
    s3_Given_C123 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3']
    s3_Given_C12 = self.workspace[gmtk.SCPT, 's3GivenC1C2']
    s3_Given_C1 = self.workspace[gmtk.DCPT, 's3GivenC1']
    s3_Unigram = self.workspace[gmtk.DCPT, 's3Unigram']

    conceptMap = self.conceptMap
    _EMPTY_ = conceptMap[EMPTY_CONCEPT]
    _DUMMY_ = conceptMap.get(DUMMY_CONCEPT, None)
    allConcepts = sorted(conceptMap.values())

    # Renumber the symbol maps so that all input symbols share one
    # contiguous id space
    symbols = []
    maps = []
    maps2 = []
    count = 1
    pte_map = SymMap()
    pte_map2 = SymMap()
    if self.pteMap:
        pte_symbols = sorted(self.pteMap.values())
        for key, value in sorted(self.pteMap.items()):
            pte_map[value] = value + count
            pte_map2[key] = value + count
        count += len(pte_map)
    else:
        pte_symbols = []
    for map in self.symbolMaps:
        if map is None:
            map = {}
        symbols.append(sorted(map.values()))
        new_map = SymMap()
        new_map2 = SymMap()
        for key, value in sorted(map.items()):
            new_map[value] = value + count
            new_map2[key] = value + count
        count += len(new_map)
        maps.append(new_map)
        maps2.append(new_map2)

    s0 = (_EMPTY_,) * 4
    s0_expanded = False

    cutoff_sym = self.cutoff_sym
    cutoff_trans = self.cutoff_trans
    max_states = self.max_states
    logger = self.logger

    # Best-first expansion of the state space, starting from the state
    # with an empty concept stack
    stack = [(0, 0, s0)]
    stack_set = set([s0])
    state_map = SymMap()
    state_map[s0] = 0
    _pop_ = self._pop_
    interim_counter = 0
    n_arcs = 0
    while stack:
        if max_states is None:
            total_states = len(state_map) - interim_counter
        else:
            total_states = max_states
        if logger is not None:
            logger.debug(
                '    #states (unexpanded/total) %.2f%%, %d/%d, #arcs %d',
                100. * len(processed) / total_states,
                total_states - len(processed), total_states, n_arcs)

        c_t_backoff, c_t_dist, c_t = stack.pop(0)
        backoff_stat[c_t_backoff].add(c_t)
        if logger is not None:
            logger.debug('    %.2f: %s, backoff=%d', c_t_dist,
                         self.strState(c_t), c_t_backoff)
        state_c_t = state_map[c_t]
        processed.add(c_t)
        stack_set.remove(c_t)

        ret = []
        pop_pmf = list(pop_Given_C[:c_t[0], c_t[1], c_t[2], c_t[3]])
        push_pmf = list(push_Given_C[:c_t[0], c_t[1], c_t[2], c_t[3]])
        for pop in range(0, MAX_POP + 1):
            prob_pop = pop_pmf[pop]
            if prob_pop <= cutoff_trans:
                continue
            interim_counter += 1
            c_inter = c_t[pop:] + (_EMPTY_,) * pop
            osym = ')' * pop
            if not osym:
                osym = 'epsilon'
            ret.append((prob_pop, c_t, (c_t, c_inter), _pop_, osym))

            for push in range(0, MAX_PUSH + 1):
                prob_push = push_pmf[push]
                if push == 0:
                    to_push_all = [()]
                else:
                    to_push_all = cartezian(*[allConcepts] * push)
                for to_push in to_push_all:
                    c_new = (to_push + c_inter)[:DEPTH]
                    if (c_t == c_new) and not (push == pop == 0):
                        continue
                    if _DUMMY_ in c_new[1:]:
                        continue

                    # Output symbol
                    osym = ''
                    for push_concept in reversed(to_push):
                        osym += conceptMap.inverse[push_concept] + '('
                    if not osym:
                        osym = 'epsilon'

                    # Smoothing
                    backoff = c1_backoff[c_new[1], c_new[2], c_new[3]]
                    if backoff == 0:
                        c1_pmf = c1_Given_C234[:c_new[1], c_new[2], c_new[3]]
                    elif backoff == 1:
                        c1_pmf = c1_Given_C23[:c_new[1], c_new[2]]
                    else:
                        c1_pmf = c1_Given_C2[:c_new[1]]
                    c2_pmf = c2_Given_C[:c_new[2], c_new[3]]

                    if push == 0:
                        prob_new_c = 1.0
                    elif push == 1:
                        prob_new_c = c1_pmf[to_push[0]]
                    elif push == 2:
                        prob_new_c = c1_pmf[to_push[0]] * c2_pmf[to_push[1]]
                    prob_trans = prob_push * prob_new_c

                    # Do cut-off
                    if prob_trans <= cutoff_trans:
                        continue

                    # Smoothing
                    backoff = s1_backoff[c_new[0], c_new[1], c_new[2],
                                         c_new[3]]
                    if backoff == 0:
                        s_pmf = [list(s1_Given_C1234[:c_new[0], c_new[1],
                                                     c_new[2], c_new[3]]),
                                 list(s2_Given_C1234[:c_new[0], c_new[1],
                                                     c_new[2], c_new[3]]),
                                 list(s3_Given_C1234[:c_new[0], c_new[1],
                                                     c_new[2], c_new[3]])]
                    elif backoff == 1:
                        s_pmf = [list(s1_Given_C123[:c_new[0], c_new[1],
                                                    c_new[2]]),
                                 list(s2_Given_C123[:c_new[0], c_new[1],
                                                    c_new[2]]),
                                 list(s3_Given_C123[:c_new[0], c_new[1],
                                                    c_new[2]])]
                    elif backoff == 2:
                        s_pmf = [list(s1_Given_C12[:c_new[0], c_new[1]]),
                                 list(s2_Given_C12[:c_new[0], c_new[1]]),
                                 list(s3_Given_C12[:c_new[0], c_new[1]])]
                    elif backoff == 3:
                        s_pmf = [list(s1_Given_C1[:c_new[0]]),
                                 list(s2_Given_C1[:c_new[0]]),
                                 list(s3_Given_C1[:c_new[0]])]
                    else:
                        s_pmf = [list(s1_Unigram),
                                 list(s2_Unigram),
                                 list(s3_Unigram)]

                    if c_new not in processed and c_new not in stack_set:
                        stack_set.add(c_new)
                        c_new_dist = c_t_dist - log(prob_trans)
                        insort(stack, (backoff, c_new_dist, c_new))

                    c_next = (c_t, c_inter)
                    if pte_symbols and c_inter == (_EMPTY_,) * 4 \
                            and push != 0:
                        for pte_sym in pte_symbols:
                            prob_ptesym = 1.0
                            pte_sym = pte_map[pte_sym]
                            pte_osym = pte_map2.inverse[pte_sym]
                            ret.append((prob_trans * prob_ptesym, c_next,
                                        c_new, pte_sym, pte_osym))
                        prob_trans = 1.0
                        c_next = c_new

                    for sym, map, pmf in zip(symbols, maps, s_pmf):
                        if map is None:
                            continue
                        for isym in sym:
                            prob_isym = pmf[isym]
                            # Do cut-off
                            if prob_isym <= cutoff_sym:
                                continue
                            else:
                                isym = map[isym]
                                ret.append((prob_trans * prob_isym, c_next,
                                            c_new, isym, osym))
                        # For symbols other than the first
                        prob_trans = 1.0
                        c_next = c_new
                        osym = 'epsilon'

        for prob, c_t, c_new, isym, osym in ret:
            state_c_new = state_map.add(c_new)
            state_c_t = state_map.add(c_t)
            osym = osym_map.add(osym)
            n_arcs += 1
            yield state_c_t, state_c_new, isym, osym, prob
        if max_states is not None and len(processed) >= max_states:
            break

    self.stateMap = self.convertStateMap(state_map)
    self.osymMap = osym_map
    self.isymMaps = maps2
    self.ipteMap = pte_map2

    backoff_stat = ADict((k, len(v)) for (k, v) in backoff_stat.iteritems())
    if logger is not None:
        logger.debug('Backoff statistics:')
        logger.debug('===================')
        total = backoff_stat.sum()
        for key, value in sorted(backoff_stat.items()):
            logger.debug('    backoff=%d: #%d (%.2f%%)',
                         key, value, 100. * value / total)
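# A minimal sketch of the backed-off CPT selection used in genStates: a
# decision tree assigns each context a backoff level, and the level picks
# which conditional table to read, dropping conditioning variables from
# the end of the context (hypothetical tables and contexts, not the GMTK
# workspace API):
def _demo_select_pmf(backoff_level, context, tables):
    """tables: list indexed by backoff level; each maps a context
    (a progressively shorter tuple) to a probability vector."""
    ctx = context[:len(context) - backoff_level]
    return tables[backoff_level][ctx]

# Example: level 1 drops the last conditioning variable:
#   tables = [{('c1', 'c2'): [0.9, 0.1]}, {('c1',): [0.6, 0.4]}]
#   _demo_select_pmf(1, ('c1', 'c2'), tables) -> [0.6, 0.4]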
def sresults(self, files, fw=sys.stdout, only=[], skip=[], twoside=False,
             groupby='none'):
    """Print HResults-like statistics of the system's output
    """
    if not only:
        only = None
    if not skip:
        skip = None

    tH = tD = tS = tI = tN = 0
    uH = uN = 0
    tHit = ADict()
    tMiss = ADict()
    tFA = ADict()
    strftime = time.strftime('%a, %d %b %Y %H:%M:%S')
    fw.write('----------------------- Semantics Scores --------------------------\n')
    for fn1, fn2 in self.genFnPairs(files, twoside):
        forest1, forest2 = self.loadForestFiles((fn1, fn2), only, skip)
        fw.write('====================== CDC Results Analysis =======================\n')
        fw.write('  Date: %s\n' % strftime)
        fw.write('  Ref : %s\n' % fn1)
        fw.write('  Rec : %s\n' % fn2)
        fw.write('-------------------------- File Results ---------------------------\n')

        processor = self.forestProcessor(forest1, forest2)
        H = N = D = I = S = 0
        last_group = None
        for fn, tree1, tree2, dist, script in processor:
            new_group = self.groupMapping(fn, groupby)
            if last_group is None:
                last_group = new_group
            if new_group != last_group:
                # The group has changed, write out its statistics
                if N != 0:
                    Corr = (float(H) / N) * 100.
                    Acc = (float(H - I) / N) * 100.
                else:
                    Corr = Acc = 0.
                fw.write('%s: %6.2f(%6.2f) [H=%4d, D=%3d, S=%3d, I=%3d, N=%3d]\n' % \
                        (last_group, Corr, Acc, H, D, S, I, N))
                tH += H
                tD += D
                tS += S
                tI += I
                tN += N
                if S == 0 and I == 0 and D == 0:
                    assert H == N
                    uH += 1
                uN += 1
                # Reset the running per-group counters
                H = N = D = I = S = 0
                # Set the new group
                last_group = new_group
            lH, lD, lI, lS = script.HDIS
            H += lH
            D += lD
            I += lI
            S += lS
            N += script.numConcepts[0]
            hit, miss, fa = script.hitMissFA
            tHit += hit
            tMiss += miss
            tFA += fa
        else:
            # Write out the statistics of the last group
            if N != 0:
                Corr = (float(H) / N) * 100.
                Acc = (float(H - I) / N) * 100.
            else:
                Corr = Acc = 0.
            fw.write('%s: %6.2f(%6.2f) [H=%4d, D=%3d, S=%3d, I=%3d, N=%3d]\n' % \
                    (last_group, Corr, Acc, H, D, S, I, N))
            tH += H
            tD += D
            tS += S
            tI += I
            tN += N
            if S == 0 and I == 0 and D == 0:
                assert H == N
                uH += 1
            uN += 1

    tCorr = 100. * tH / tN
    tAcc = 100. * (tH - tI) / tN
    uS = uN - uH
    uCorr = 100. * uH / uN

    fw.write('------------------------ Concept Results --------------------------\n')
    allConcepts = (tHit + tMiss + tFA).keys()
    allResults = []
    for concept in sorted(allConcepts):
        C = float(tHit[concept])
        FA = float(tFA[concept])
        M = float(tMiss[concept])
        if C + FA > 0:
            Prec = C / (C + FA) * 100
        else:
            Prec = 0
        if C + M > 0:
            Recall = C / (C + M) * 100
        else:
            Recall = 0
        if Prec + Recall > 0:
            F = 2 * Prec * Recall / (Prec + Recall)
        else:
            F = 0
        allResults.append((F, concept, Prec, Recall, C, M, FA))
    for F, concept, Prec, Recall, C, M, FA in sorted(allResults,
                                                     reverse=True):
        fw.write('%-15s: F=%6.2f, P=%6.2f, R=%6.2f [C=%d, M=%d, FA=%d]\n'
                 % (concept, F, Prec, Recall, C, M, FA))

    tC = float(tHit.sum())
    tM = float(tMiss.sum())
    tFA = float(tFA.sum())
    tPrec = tC / (tC + tFA) * 100
    tRecall = tC / (tC + tM) * 100
    tF = 2 * tPrec * tRecall / (tPrec + tRecall)

    fw.write('------------------------ Overall Results --------------------------\n')
    fw.write('UTTR: %%Correct=%.2f [H=%d, S=%d, N=%d]\n'
             % (uCorr, uH, uS, uN))
    fw.write('CONC: F=%.2f, P=%.2f, R=%.2f [C=%d, M=%d, FA=%d]\n'
             % (tF, tPrec, tRecall, tC, tM, tFA))
    fw.write('      %%Corr=%.2f, Acc=%.2f [H=%d, D=%d, S=%d, I=%d, N=%d]\n'
             % (tCorr, tAcc, tH, tD, tS, tI, tN))
    fw.write('===================================================================\n')

    ret = {}
    ret['cCorr'] = tCorr
    ret['cAcc'] = tAcc
    ret['cH'] = tH
    ret['cD'] = tD
    ret['cS'] = tS
    ret['cI'] = tI
    ret['cN'] = tN
    ret['uCorr'] = uCorr
    ret['uH'] = uH
    ret['uS'] = uS
    ret['uN'] = uN
    ret['Prec'] = tPrec
    ret['Recall'] = tRecall
    ret['F'] = tF
    return ret
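# A short worked example of the metrics printed above, with HTK-style
# counts (H hits, D deletions, S substitutions, I insertions, N reference
# concepts; C/M/FA = correct/missed/false-alarm concepts):
#
#   %Corr = 100 * H / N              Acc = 100 * (H - I) / N
#   P = 100 * C / (C + FA)           R = 100 * C / (C + M)
#   F = 2 * P * R / (P + R)
#
#   With H=8, D=1, S=1, I=2, N=10:   %Corr = 80.00, Acc = 60.00
#   With C=8, M=2, FA=3:             P = 72.73, R = 80.00, F = 76.19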
def collectPos_netyped(self, ne_type, words):
    pos_ex = ADict()
    if ne_type is not None:
        for w in words:
            pos_ex[ne_type, w] += 1
    return pos_ex
def doEquivalenceTable(self, ex, table):
    ret = ADict()
    for (concept, common), count in ex.iteritems():
        for w in table[common]:
            ret[concept, w] = count
    return ret
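# A toy run of the equivalence expansion above: counts collected over a
# common (generalized) symbol are copied to every surface word mapped to
# it (hypothetical table):
#
#   ex    = {('TIME', 'NUMBER'): 5}
#   table = {'NUMBER': set(['one', 'two'])}
#   -> {('TIME', 'one'): 5, ('TIME', 'two'): 5}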