def words(self):
    return LazyMap(self._word_tokenize, self.text())
def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly drawing
    from a sample of the possible permutations. At the limit of the number
    of possible permutations, the significance level is exact. The
    approximate significance level is the sample mean number of times the
    statistic of the permuted lists varies from the actual statistic of
    the unpermuted argument lists.

    :return: a tuple containing an approximate significance level, the count
             of the number of times the pseudo-statistic varied from the
             actual statistic, and the number of shuffles
    :rtype: tuple
    :param a: a list of test values
    :type a: list
    :param b: another list of independently generated test values
    :type b: list
    """
    shuffles = kwargs.get('shuffles', 999)
    # there's no point in trying to shuffle beyond all possible permutations
    shuffles = \
        min(shuffles, reduce(operator.mul, xrange(1, len(a) + len(b) + 1)))
    stat = kwargs.get('statistic', lambda lst: float(sum(lst)) / len(lst))
    verbose = kwargs.get('verbose', False)

    if verbose:
        print 'shuffles: %d' % shuffles

    actual_stat = fabs(stat(a) - stat(b))

    if verbose:
        print 'actual statistic: %f' % actual_stat
        print '-' * 60

    c = 1e-100
    lst = LazyConcatenation([a, b])
    indices = range(len(a) + len(b))

    for i in range(shuffles):
        if verbose and i % 10 == 0:
            print 'shuffle: %d' % i

        shuffle(indices)

        pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[:len(a)]))
        pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a):]))
        pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if verbose and i % 10 == 0:
            print 'pseudo-statistic: %f' % pseudo_stat
            print 'significance: %f' % (float(c + 1) / (i + 1))
            print '-' * 60

    significance = float(c + 1) / (shuffles + 1)

    if verbose:
        print 'significance: %f' % significance
        if betai:
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print "prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi))

    return (significance, c, shuffles)
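# A minimal usage sketch for approxrand above; the score lists and the
# `shuffles` value are made up for illustration. A small significance level
# suggests the observed difference in means is unlikely under random
# relabeling of the two samples.
scores_a = [0.91, 0.88, 0.95, 0.87, 0.90]
scores_b = [0.84, 0.86, 0.89, 0.82, 0.85]
significance, count, shuffles = approxrand(scores_a, scores_b, shuffles=999)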
def sents(self, fileids=None):
    self._require(self.WORDS)
    return LazyMap(self._get_words, self._grids(fileids))
def _get_words(self, fileid, sent, stem, relation, pos, strip_space, replace):
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall('.//{%s}u' % NS):
        sents = [xmlsent.get('uID'), xmlsent.get('who')]
        #print(xmlsent.get('uID'), xmlsent.get('who'))
        # select speakers
        #if speaker == 'ALL' or xmlsent.get('who') in speaker:
        for xmlword in xmlsent.findall('.//{%s}w' % NS):
            infl = None
            suffixStem = None
            suffixTag = None
            # getting replaced words
            if replace and xmlsent.find('.//{%s}w/{%s}replacement' % (NS, NS)):
                xmlword = xmlsent.find(
                    './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS))
            elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS)):
                xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS))
            # get text
            if xmlword.text:
                word = xmlword.text
            else:
                word = ''
            # strip trailing space
            if strip_space:
                word = word.strip()
            # stem
            '''
            if relation or stem:
                try:
                    xmlstem = xmlword.find('.//{%s}stem' % NS)
                    word = xmlstem.text
                except AttributeError as e:
                    pass
                # if there is an inflection
                try:
                    xmlinfl = xmlword.find(
                        './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS))
                    word += '-' + xmlinfl.text
                except:
                    pass
                # if there is a suffix
                try:
                    xmlsuffix = xmlword.find(
                        './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                        % (NS, NS, NS, NS))
                    suffixStem = xmlsuffix.text
                except AttributeError:
                    suffixStem = ""
                if suffixStem:
                    word += "~" + suffixStem
            # pos'''
            if relation or pos:
                try:
                    xmlpos = xmlword.findall(".//{%s}c" % NS)
                    xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                    if xmlpos2 != []:
                        tag = xmlpos[0].text + ":" + xmlpos2[0].text
                    else:
                        tag = xmlpos[0].text
                except (AttributeError, IndexError) as e:
                    tag = ""
                try:
                    xmlsuffixpos = xmlword.findall(
                        './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
                        % (NS, NS, NS, NS, NS))
                    xmlsuffixpos2 = xmlword.findall(
                        './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
                        % (NS, NS, NS, NS, NS))
                    if xmlsuffixpos2:
                        suffixTag = xmlsuffixpos[0].text + ":" \
                            + xmlsuffixpos2[0].text
                    else:
                        suffixTag = xmlsuffixpos[0].text
                except:
                    pass
                if suffixTag:
                    tag += "~" + suffixTag
                word = (word, tag)
            # relational
            # the gold standard is stored in
            # <mor></mor><mor type="trn"><gra type="grt">
            if relation == True:
                for xmlstem_rel in xmlword.findall(
                        './/{%s}mor/{%s}gra' % (NS, NS)):
                    if not xmlstem_rel.get('type') == 'grt':
                        word = (word[0], word[1],
                                xmlstem_rel.get('index') + "|"
                                + xmlstem_rel.get('head') + "|"
                                + xmlstem_rel.get('relation'))
                    else:
                        word = (word[0], word[1], word[2], word[0], word[1],
                                xmlstem_rel.get('index') + "|"
                                + xmlstem_rel.get('head') + "|"
                                + xmlstem_rel.get('relation'))
                try:
                    for xmlpost_rel in xmlword.findall(
                            './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)):
                        if not xmlpost_rel.get('type') == 'grt':
                            suffixStem = (suffixStem[0], suffixStem[1],
                                          xmlpost_rel.get('index') + "|"
                                          + xmlpost_rel.get('head') + "|"
                                          + xmlpost_rel.get('relation'))
                        else:
                            suffixStem = (suffixStem[0], suffixStem[1],
                                          suffixStem[2], suffixStem[0],
                                          suffixStem[1],
                                          xmlpost_rel.get('index') + "|"
                                          + xmlpost_rel.get('head') + "|"
                                          + xmlpost_rel.get('relation'))
                except:
                    pass
            sents.append(word)
        #print(sents)
        if sent or relation:
            results.append(sents)
        else:
            results.extend(sents)
    return LazyMap(lambda x: x, results)
def sents(self, fileids=None):
    return LazyMap(untagged, self.elements(fileids))
def tagged_words(self, fileids=None, tagset=None):
    self._require(self.WORDS, self.POS)

    def get_tagged_words(grid):
        return self._get_tagged_words(grid, tagset)

    return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))
def sents(self):
    # LazyMap from nltk.util:
    return LazyMap(lambda t: t.leaves(), self.get_trees())
print("Running on dev") test_data = [json.loads(line) for line in open_file("twt.dev.json")] else: print("Running on test") test_data = [json.loads(line) for line in open_file("twt.test.json")] test_data = handle_lowfreq_words(vocab)(test_data) twitter_model = hmm.HiddenMarkovModelTagger(symbols=hmm_model.symbols, states=tagset, transitions=transition_model, outputs=emission_model, priors=init_model) # Compute the accuracy - we can call this, but then we just do extra decoding # work. What we really need is just call nltk.metrics.accuracy on the gold and # predicted. # twitter_model.test( test_data ) # Compute the confusion matrix, technically we would be doing this twice, as # when computing accuracy we would've already done this. It would be more # optimal to modify the hmm library. But meh. gold = tag_list(test_data) unlabeled_data = LazyMap(unlabeled_words, test_data) predicted_labels = list(LazyMap(twitter_model.tag, unlabeled_data)) predicted = tag_list(predicted_labels) acc = accuracy(gold, predicted) print("Accuracy: ", acc) cm = ConfusionMatrix(gold, predicted) print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=25))
def transform(labeled_symbols):
    return LazyMap(relabel, labeled_symbols)
def tagged_words(self):
    return LazyConcatenation(LazyMap(self._tagger.tag, self.sents()))
def chunked_sents(self):
    return LazyMap(self._chunker.chunk, self.tagged_sents())
def tagged_sents(self):
    return LazyMap(self._tagger.tag, self.sents())
def zipzip(*lists):
    return LazyMap(lambda lst: zip(*lst), LazyZip(*lists))
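# Usage sketch for zipzip (data made up): LazyZip pairs up the i-th items of
# each input list, and the mapped zip(*...) transposes each pair, so zipping
# per-sentence word lists with tag lists lazily yields (word, tag) pairs.
# Under Python 3, each element is a zip iterator, hence the list() call.
words = [['a', 'cat'], ['it', 'ran']]
tags = [['DT', 'NN'], ['PRP', 'VBD']]
for pairs in zipzip(words, tags):
    print(list(pairs))  # [('a', 'DT'), ('cat', 'NN')], then [('it', 'PRP'), ('ran', 'VBD')]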
def parsed_sents(self):
    return LazyMap(self._chunker.parse, self.tagged_sents())
def tagged_sents(self, fileids=None):
    def f(s):
        return [(w, simple_tag(t)) for w, t in s]

    return LazyMap(f, super().tagged_sents(fileids))
def tagged_sents(self, fileids=None):
    self._require(self.WORDS, self.POS)
    return LazyMap(self._get_tagged_words, self._grids(fileids))
def tagged_sents(self):
    # LazyMap from nltk.util:
    f = lambda t: [(x, x) for x in t.leaves()]
    return LazyMap(f, self.get_trees())
    os.remove(test_tmp)
    sys.exit(0)

if options.trainer:
    if options.pos:
        reader = MXPostTaggerCorpusReader(eval(options.corpus))
        iob_sents = reader.iob_sents()
        tagged_sents = reader.tagged_sents()
        corpus = LazyMap(lambda (iob_sent, tagged_sent):
                             [(iw, tt, iob)
                              for ((iw, iob), (tw, tt))
                              in zip(iob_sent, tagged_sent)],
                         LazyZip(iob_sents, tagged_sents))
    else:
        iob_sents = eval(options.corpus).iob_sents()
        corpus = LazyMap(lambda iob_sent:
                             [(w, None, i) for w, i in iob_sent],
                         iob_sents)

    num_train, num_test = options.numsents
    num_train = num_train or int(len(corpus) * 0.9)
    num_test = num_test or (len(corpus) - num_train)
    train = corpus[:num_train]
    test = corpus[num_train:num_train + num_test]

    trainer = eval(options.trainer)
    if options.verbose:
        print 'Training %s with %d sentences' % \
            (options.trainer, num_train)
    ner = trainer(train,
                  feature_detector=NERChunkTaggerFeatureDetector,
                  chunk_types=_NE_CHUNK_TYPES,
                  verbose=options.verbose)
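# Note: the tuple-unpacking parameter `lambda (iob_sent, tagged_sent): ...`
# above is Python 2 only syntax (removed in Python 3 by PEP 3113). A sketch
# of an equivalent Python 3 form, unpacking inside the body instead:
corpus = LazyMap(
    lambda pair: [(iw, tt, iob)
                  for ((iw, iob), (tw, tt)) in zip(pair[0], pair[1])],
    LazyZip(iob_sents, tagged_sents))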
def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
    self._require(self.WORDS, self.POS, self.TREE)
    if pos_in_tree is None:
        pos_in_tree = self._pos_in_tree

    def get_parsed_sent(grid):  # capture pos_in_tree as local var
        return self._get_parsed_sent(grid, pos_in_tree, tagset)

    return LazyMap(get_parsed_sent, self._grids(fileids))
def iterate_from(self, start):
    f = lambda d: d.get(self.field, '')
    return iter(
        LazyMap(f, self.collection.find(fields=[self.field], skip=start)))
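# Note: `fields=` is the legacy PyMongo 2.x keyword to Collection.find();
# PyMongo 3+ renamed it to `projection`. A hypothetical modernized call:
#     self.collection.find(projection=[self.field], skip=start)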
def tagged_sents(self):
    # LazyMap from nltk.util:
    return LazyMap(lambda t: t.pos(), self.get_trees())
def words(self):
    return LazyConcatenation(LazyMap(self._word_tokenize, self.text()))
def parsed_sents(self, fileids=None):
    return LazyMap(parsed, self.elements(fileids))
def sents(self):
    return LazyConcatenation(LazyMap(self._sent_tokenize, self.text()))
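# The wrappers above illustrate the recurring pattern in this file: LazyMap
# applies a function to each element without materializing a list, and
# LazyConcatenation flattens a lazy sequence of lists into one lazy sequence.
# A minimal self-contained sketch with made-up data:
from nltk.util import LazyConcatenation, LazyMap

texts = ['a b', 'c d e']
tokenized = LazyMap(str.split, texts)     # lazily: [['a', 'b'], ['c', 'd', 'e']]
flattened = LazyConcatenation(tokenized)  # lazily: ['a', 'b', 'c', 'd', 'e']
print(list(flattened))                    # ['a', 'b', 'c', 'd', 'e']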
def tagged_lem_sents(self, fileids=None, tagset=None):
    self._require(self.WORDS, self.POS, self.LEMMA)

    def get_tagged_lemmas(grid):
        return self._get_tagged_lemmas(grid, tagset)

    return LazyMap(get_tagged_lemmas, self._grids(fileids))
def dep_srl_spans(self, fileids=None):
    self._require(self.SRL, self.FILLPRED)
    return LazyMap(self._get_dep_srl_spans, self._grids(fileids))
def words(self, fileids=None):
    self._require(self.WORDS)
    return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))
def _get_words(
    self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
):
    if (
        isinstance(speaker, str) and speaker != "ALL"
    ):  # ensure we have a list of speakers
        speaker = [speaker]
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall(".//{%s}u" % NS):
        sents = []
        # select speakers
        if speaker == "ALL" or xmlsent.get("who") in speaker:
            for xmlword in xmlsent.findall(".//{%s}w" % NS):
                infl = None
                suffixStem = None
                suffixTag = None
                # getting replaced words
                if replace and xmlsent.find(".//{%s}w/{%s}replacement" % (NS, NS)):
                    xmlword = xmlsent.find(
                        ".//{%s}w/{%s}replacement/{%s}w" % (NS, NS, NS)
                    )
                elif replace and xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS)):
                    xmlword = xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS))
                # get text
                if xmlword.text:
                    word = xmlword.text
                else:
                    word = ""
                # strip trailing space
                if strip_space:
                    word = word.strip()
                # stem
                if relation or stem:
                    try:
                        xmlstem = xmlword.find(".//{%s}stem" % NS)
                        word = xmlstem.text
                    except AttributeError as e:
                        pass
                    # if there is an inflection
                    try:
                        xmlinfl = xmlword.find(
                            ".//{%s}mor/{%s}mw/{%s}mk" % (NS, NS, NS)
                        )
                        word += "-" + xmlinfl.text
                    except:
                        pass
                    # if there is a suffix
                    try:
                        xmlsuffix = xmlword.find(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                            % (NS, NS, NS, NS)
                        )
                        suffixStem = xmlsuffix.text
                    except AttributeError:
                        suffixStem = ""
                    if suffixStem:
                        word += "~" + suffixStem
                # pos
                if relation or pos:
                    try:
                        xmlpos = xmlword.findall(".//{%s}c" % NS)
                        xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                        if xmlpos2 != []:
                            tag = xmlpos[0].text + ":" + xmlpos2[0].text
                        else:
                            tag = xmlpos[0].text
                    except (AttributeError, IndexError) as e:
                        tag = ""
                    try:
                        xmlsuffixpos = xmlword.findall(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                            % (NS, NS, NS, NS, NS)
                        )
                        xmlsuffixpos2 = xmlword.findall(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                            % (NS, NS, NS, NS, NS)
                        )
                        if xmlsuffixpos2:
                            suffixTag = (
                                xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                            )
                        else:
                            suffixTag = xmlsuffixpos[0].text
                    except:
                        pass
                    if suffixTag:
                        tag += "~" + suffixTag
                    word = (word, tag)
                # relational
                # the gold standard is stored in
                # <mor></mor><mor type="trn"><gra type="grt">
                if relation == True:
                    for xmlstem_rel in xmlword.findall(
                        ".//{%s}mor/{%s}gra" % (NS, NS)
                    ):
                        if not xmlstem_rel.get("type") == "grt":
                            word = (
                                word[0],
                                word[1],
                                xmlstem_rel.get("index")
                                + "|"
                                + xmlstem_rel.get("head")
                                + "|"
                                + xmlstem_rel.get("relation"),
                            )
                        else:
                            word = (
                                word[0],
                                word[1],
                                word[2],
                                word[0],
                                word[1],
                                xmlstem_rel.get("index")
                                + "|"
                                + xmlstem_rel.get("head")
                                + "|"
                                + xmlstem_rel.get("relation"),
                            )
                    try:
                        for xmlpost_rel in xmlword.findall(
                            ".//{%s}mor/{%s}mor-post/{%s}gra" % (NS, NS, NS)
                        ):
                            if not xmlpost_rel.get("type") == "grt":
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    xmlpost_rel.get("index")
                                    + "|"
                                    + xmlpost_rel.get("head")
                                    + "|"
                                    + xmlpost_rel.get("relation"),
                                )
                            else:
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    suffixStem[2],
                                    suffixStem[0],
                                    suffixStem[1],
                                    xmlpost_rel.get("index")
                                    + "|"
                                    + xmlpost_rel.get("head")
                                    + "|"
                                    + xmlpost_rel.get("relation"),
                                )
                    except:
                        pass
                sents.append(word)
        if sent or relation:
            results.append(sents)
        else:
            results.extend(sents)
    return LazyMap(lambda x: x, results)
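# The method above backs NLTK's CHILDESCorpusReader. A usage sketch, assuming
# a local copy of a CHILDES XML corpus (the corpus_root path is hypothetical):
from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = '/path/to/childes/Eng-USA/'  # hypothetical path
reader = CHILDESCorpusReader(corpus_root, r'.*\.xml')
# Sentences restricted to the target child; strip_space defaults to True.
print(reader.sents(reader.fileids()[0], speaker='CHI')[:2])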
def srl_spans(self, fileids=None):
    self._require(self.SRL)
    return LazyMap(self._get_srl_spans, self._grids(fileids))
def tagged_words(self, fileids=None, simplify_tags=False):
    self._require(self.WORDS, self.POS)

    def get_tagged_words(grid):
        return self._get_tagged_words(grid, simplify_tags)

    return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))