Example #1
 def words(self):
   return LazyMap(self._word_tokenize, self.text())
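All of the snippets on this page use nltk.util.LazyMap, which applies a function to each element of a sequence on demand rather than eagerly. A minimal self-contained sketch (the input lines are made up for illustration):

from nltk.util import LazyMap

lines = ['How now', 'brown cow']   # stand-in for the corpus text above
words = LazyMap(str.split, lines)  # nothing is tokenized yet
print(words[1])                    # ['brown', 'cow'] -- computed on access
print(len(words))                  # 2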
Example #2
import operator
from functools import reduce
from math import fabs
from random import shuffle

from nltk.util import LazyConcatenation, LazyMap

# betai (the regularized incomplete beta function) is optional here; older
# SciPy releases exposed it, and the verbose report below skips it if absent.
try:
    from scipy.stats.stats import betai
except ImportError:
    betai = None


def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly sampling
    from the possible permutations. In the limit of the number of possible
    permutations, the significance level is exact. The approximate
    significance level is the proportion of shuffles in which the statistic
    of the permuted lists is at least as extreme as the statistic of the
    unpermuted argument lists.

    :return: a tuple containing the approximate significance level, the count
             of shuffles whose pseudo-statistic was at least as extreme as
             the actual statistic, and the number of shuffles
    :rtype: tuple
    :param a: a list of test values
    :type a: list
    :param b: another list of independently generated test values
    :type b: list
    """
    shuffles = kwargs.get('shuffles', 999)
    # there's no point in trying to shuffle beyond all possible permutations
    shuffles = \
        min(shuffles, reduce(operator.mul, range(1, len(a) + len(b) + 1)))
    stat = kwargs.get('statistic', lambda lst: float(sum(lst)) / len(lst))
    verbose = kwargs.get('verbose', False)

    if verbose:
        print('shuffles: %d' % shuffles)

    actual_stat = fabs(stat(a) - stat(b))

    if verbose:
        print('actual statistic: %f' % actual_stat)
        print('-' * 60)

    c = 1e-100  # tiny nonzero count so betai below never sees an exact zero
    lst = LazyConcatenation([a, b])
    indices = list(range(len(a) + len(b)))

    for i in range(shuffles):
        if verbose and i % 10 == 0:
            print('shuffle: %d' % i)

        shuffle(indices)

        pseudo_stat_a = stat(LazyMap(lambda j: lst[j], indices[:len(a)]))
        pseudo_stat_b = stat(LazyMap(lambda j: lst[j], indices[len(a):]))
        pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if verbose and i % 10 == 0:
            print('pseudo-statistic: %f' % pseudo_stat)
            print('significance: %f' % (float(c + 1) / (i + 1)))
            print('-' * 60)

    significance = float(c + 1) / (shuffles + 1)

    if verbose:
        print('significance: %f' % significance)
        if betai:
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print "prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi))

    return (significance, c, shuffles)
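A hedged usage sketch for approxrand (the sample data below is made up): two samples drawn from distributions with different means should yield a small significance level.

from random import gauss, seed

seed(0)
a = [gauss(0.0, 1.0) for _ in range(30)]
b = [gauss(1.0, 1.0) for _ in range(30)]

significance, count, shuffles = approxrand(a, b, shuffles=999)
print('significance: %.3f over %d shuffles' % (significance, shuffles))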
Example #3
 def sents(self, fileids=None):
     self._require(self.WORDS)
     return LazyMap(self._get_words, self._grids(fileids))
Example #4
 def _get_words(self, fileid, sent, stem, relation, pos, strip_space,
                replace):
     xmldoc = ElementTree.parse(fileid).getroot()
     # processing each xml doc
     results = []
     for xmlsent in xmldoc.findall('.//{%s}u' % NS):
         sents = [xmlsent.get('uID'), xmlsent.get('who')]
         #print(xmlsent.get('uID'), xmlsent.get('who'))
         # select speakers
         #if speaker == 'ALL' or xmlsent.get('who') in speaker:
         for xmlword in xmlsent.findall('.//{%s}w' % NS):
             infl = None
             suffixStem = None
             suffixTag = None
             # getting replaced words
             if replace and xmlsent.find('.//{%s}w/{%s}replacement' %
                                         (NS, NS)):
                 xmlword = xmlsent.find('.//{%s}w/{%s}replacement/{%s}w' %
                                        (NS, NS, NS))
             elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS)):
                 xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS))
              # get text
              if xmlword.text:
                  word = xmlword.text
              else:
                  word = ''
              # strip trailing space
              if strip_space:
                  word = word.strip()
              # stem
             '''
             if relation or stem:
                 try:
                     xmlstem = xmlword.find('.//{%s}stem' % NS)
                     word = xmlstem.text
                 except AttributeError as e:
                     pass
                     # if there is an inflection
                 try:
                     xmlinfl = xmlword.find('.//{%s}mor/{%s}mw/{%s}mk'
                                                % (NS,NS,NS))
                     word += '-' + xmlinfl.text
                 except:
                     pass
                     # if there is a suffix
                 try:
                     xmlsuffix = xmlword.find('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                                                  % (NS,NS,NS,NS))
                     suffixStem = xmlsuffix.text
                 except AttributeError:
                     suffixStem = ""
                 if suffixStem:
                     word += "~"+suffixStem
                 # pos'''
             if relation or pos:
                 try:
                     xmlpos = xmlword.findall(".//{%s}c" % NS)
                     xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                     if xmlpos2 != []:
                         tag = xmlpos[0].text + ":" + xmlpos2[0].text
                     else:
                         tag = xmlpos[0].text
                 except (AttributeError, IndexError) as e:
                     tag = ""
                 try:
                     xmlsuffixpos = xmlword.findall(
                         './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c' %
                         (NS, NS, NS, NS, NS))
                     xmlsuffixpos2 = xmlword.findall(
                         './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s' %
                         (NS, NS, NS, NS, NS))
                     if xmlsuffixpos2:
                         suffixTag = xmlsuffixpos[
                             0].text + ":" + xmlsuffixpos2[0].text
                     else:
                         suffixTag = xmlsuffixpos[0].text
                 except:
                     pass
                 if suffixTag:
                     tag += "~" + suffixTag
                 word = (word, tag)
                 # relational
                 # the gold standard is stored in
                 # <mor></mor><mor type="trn"><gra type="grt">
             if relation == True:
                 for xmlstem_rel in xmlword.findall('.//{%s}mor/{%s}gra' %
                                                    (NS, NS)):
                     if not xmlstem_rel.get('type') == 'grt':
                         word = (word[0], word[1],
                                 xmlstem_rel.get('index') + "|" +
                                 xmlstem_rel.get('head') + "|" +
                                 xmlstem_rel.get('relation'))
                     else:
                         word = (word[0], word[1], word[2], word[0],
                                 word[1], xmlstem_rel.get('index') + "|" +
                                 xmlstem_rel.get('head') + "|" +
                                 xmlstem_rel.get('relation'))
                 try:
                     for xmlpost_rel in xmlword.findall(
                             './/{%s}mor/{%s}mor-post/{%s}gra' %
                         (NS, NS, NS)):
                         if not xmlpost_rel.get('type') == 'grt':
                             suffixStem = (suffixStem[0], suffixStem[1],
                                           xmlpost_rel.get('index') + "|" +
                                           xmlpost_rel.get('head') + "|" +
                                           xmlpost_rel.get('relation'))
                         else:
                             suffixStem = (suffixStem[0], suffixStem[1],
                                           suffixStem[2], suffixStem[0],
                                           suffixStem[1],
                                           xmlpost_rel.get('index') + "|" +
                                           xmlpost_rel.get('head') + "|" +
                                           xmlpost_rel.get('relation'))
                 except:
                     pass
             sents.append(word)
             #print(sents)
         if sent or relation:
             results.append(sents)
         else:
             results.extend(sents)
     return LazyMap(lambda x: x, results)
Example #5
 def sents(self, fileids=None):
     return LazyMap(untagged, self.elements(fileids))
Example #6
 def tagged_words(self, fileids=None, tagset=None):
     self._require(self.WORDS, self.POS)
     def get_tagged_words(grid):
         return self._get_tagged_words(grid, tagset)
     return LazyConcatenation(LazyMap(get_tagged_words,
                                      self._grids(fileids)))
Example #7
 def sents(self):
     # LazyMap from nltk.util:
     return LazyMap(lambda t: t.leaves(), self.get_trees())
Example #8
        print("Running on dev")
        test_data = [json.loads(line) for line in open_file("twt.dev.json")]
    else:
        print("Running on test")
        test_data = [json.loads(line) for line in open_file("twt.test.json")]
    test_data = handle_lowfreq_words(vocab)(test_data)
    twitter_model = hmm.HiddenMarkovModelTagger(symbols=hmm_model.symbols,
                                                states=tagset,
                                                transitions=transition_model,
                                                outputs=emission_model,
                                                priors=init_model)

    # Compute the accuracy - we can call this, but then we just do extra
    # decoding work. What we really need is to call nltk.metrics.accuracy
    # on the gold and predicted tags.
    # twitter_model.test( test_data )

    # Compute the confusion matrix. Technically we would be doing this twice,
    # since computing accuracy already does it; it would be more efficient to
    # modify the hmm library. But meh.
    gold = tag_list(test_data)
    unlabeled_data = LazyMap(unlabeled_words, test_data)
    predicted_labels = list(LazyMap(twitter_model.tag, unlabeled_data))
    predicted = tag_list(predicted_labels)

    acc = accuracy(gold, predicted)
    print("Accuracy: ", acc)
    cm = ConfusionMatrix(gold, predicted)
    print(cm.pretty_format(sort_by_count=True, show_percents=True,
                           truncate=25))
Example #9
 def transform(labeled_symbols):
     return LazyMap(relabel, labeled_symbols)
Example #10
 def tagged_words(self):
     return LazyConcatenation(LazyMap(self._tagger.tag, self.sents()))
Example #11
 def chunked_sents(self):
     return LazyMap(self._chunker.chunk, self.tagged_sents())
Example #12
 def tagged_sents(self):
     return LazyMap(self._tagger.tag, self.sents())
Example #13
def zipzip(*lists):
    return LazyMap(lambda lst: zip(*lst), LazyZip(*lists))
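For intuition, a hedged sketch of what zipzip yields (the data is illustrative): LazyZip pairs up the i-th item of each argument list, and the inner zip(*lst) transposes each such pair element-wise.

from nltk.util import LazyMap, LazyZip   # required by zipzip above

words = [['the', 'cat'], ['a', 'dog']]
tags = [['DT', 'NN'], ['DT', 'NN']]
for sent in zipzip(words, tags):
    print(list(sent))
# [('the', 'DT'), ('cat', 'NN')]
# [('a', 'DT'), ('dog', 'NN')]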
Example #14
 def parsed_sents(self):
     return LazyMap(self._chunker.parse, self.tagged_sents())
Example #15
    def tagged_sents(self, fileids=None):
        def f(s):
            return [(w, simple_tag(t)) for w, t in s]

        return LazyMap(f, super().tagged_sents(fileids))
Example #16
 def tagged_sents(self, fileids=None):
     self._require(self.WORDS, self.POS)
     return LazyMap(self._get_tagged_words, self._grids(fileids))
Example #17
 def tagged_sents(self):
     # LazyMap from nltk.util:
     f = lambda t: [(x,x) for x in t.leaves()]
     return LazyMap(f,  self.get_trees())
Example #18
        os.remove(test_tmp)     
        
        sys.exit(0)                

    if options.trainer:
        if options.pos:
            reader = MXPostTaggerCorpusReader(eval(options.corpus))
            iob_sents = reader.iob_sents()
            tagged_sents = reader.tagged_sents()
            corpus = LazyMap(
                lambda pair: [(iw, tt, iob)
                              for ((iw, iob), (tw, tt)) in zip(pair[0], pair[1])],
                LazyZip(iob_sents, tagged_sents))
        else:
            iob_sents = eval(options.corpus).iob_sents()
            corpus = LazyMap(lambda iob_sent:
                [(w, None, i) for w, i in iob_sent], iob_sents)

        num_train, num_test = options.numsents
        num_train = num_train or int(len(corpus) * 0.9)
        num_test = num_test or (len(corpus) - num_train)
        train = corpus[:num_train]
        test = corpus[num_train:num_train + num_test]

        trainer = eval(options.trainer)        
        if options.verbose:
        print('Training %s with %d sentences' %
              (options.trainer, num_train))
        ner = trainer(train, 
            feature_detector=NERChunkTaggerFeatureDetector,
            chunk_types=_NE_CHUNK_TYPES,
            verbose=options.verbose)
Example #19
 def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
     self._require(self.WORDS, self.POS, self.TREE)
     if pos_in_tree is None: pos_in_tree = self._pos_in_tree
     def get_parsed_sent(grid): # capture pos_in_tree as local var
         return self._get_parsed_sent(grid, pos_in_tree, tagset)
     return LazyMap(get_parsed_sent, self._grids(fileids))
Example #20
 def iterate_from(self, start):
     f = lambda d: d.get(self.field, '')
     return iter(
         LazyMap(f, self.collection.find(fields=[self.field], skip=start)))
Example #21
 def tagged_sents(self):
     # LazyMap from nltk.util:
     return LazyMap(lambda t: t.pos(), self.get_trees())
Example #22
 def words(self):
     return LazyConcatenation(LazyMap(self._word_tokenize, self.text()))
Example #23
 def parsed_sents(self, fileids=None):
     return LazyMap(parsed, self.elements(fileids))
Example #24
 def sents(self):
     return LazyConcatenation(LazyMap(self._sent_tokenize, self.text()))
Example #25
    def tagged_lem_sents(self, fileids=None, tagset=None):
        self._require(self.WORDS, self.POS, self.LEMMA)

        def get_tagged_lemmas(grid):
            return self._get_tagged_lemmas(grid, tagset)
        return LazyMap(get_tagged_lemmas, self._grids(fileids))
Example #26
 def dep_srl_spans(self, fileids=None):
     self._require(self.SRL, self.FILLPRED)
     return LazyMap(self._get_dep_srl_spans, self._grids(fileids))
Example #27
 def words(self, fileids=None):
     self._require(self.WORDS)
     return LazyConcatenation(LazyMap(self._get_words,
                                      self._grids(fileids)))
Example #28
 def _get_words(
     self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
 ):
     if (
         isinstance(speaker, str) and speaker != "ALL"
     ):  # ensure we have a list of speakers
         speaker = [speaker]
     xmldoc = ElementTree.parse(fileid).getroot()
     # processing each xml doc
     results = []
     for xmlsent in xmldoc.findall(".//{%s}u" % NS):
         sents = []
         # select speakers
         if speaker == "ALL" or xmlsent.get("who") in speaker:
             for xmlword in xmlsent.findall(".//{%s}w" % NS):
                 infl = None
                 suffixStem = None
                 suffixTag = None
                 # getting replaced words
                 if replace and xmlsent.find(".//{%s}w/{%s}replacement" % (NS, NS)):
                     xmlword = xmlsent.find(
                         ".//{%s}w/{%s}replacement/{%s}w" % (NS, NS, NS)
                     )
                 elif replace and xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS)):
                     xmlword = xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS))
                 # get text
                 if xmlword.text:
                     word = xmlword.text
                 else:
                     word = ""
                  # strip trailing space
                 if strip_space:
                     word = word.strip()
                 # stem
                 if relation or stem:
                     try:
                         xmlstem = xmlword.find(".//{%s}stem" % NS)
                         word = xmlstem.text
                     except AttributeError as e:
                         pass
                     # if there is an inflection
                     try:
                         xmlinfl = xmlword.find(
                             ".//{%s}mor/{%s}mw/{%s}mk" % (NS, NS, NS)
                         )
                         word += "-" + xmlinfl.text
                     except:
                         pass
                     # if there is a suffix
                     try:
                         xmlsuffix = xmlword.find(
                             ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                             % (NS, NS, NS, NS)
                         )
                         suffixStem = xmlsuffix.text
                     except AttributeError:
                         suffixStem = ""
                     if suffixStem:
                         word += "~" + suffixStem
                 # pos
                 if relation or pos:
                     try:
                         xmlpos = xmlword.findall(".//{%s}c" % NS)
                         xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                         if xmlpos2 != []:
                             tag = xmlpos[0].text + ":" + xmlpos2[0].text
                         else:
                             tag = xmlpos[0].text
                     except (AttributeError, IndexError) as e:
                         tag = ""
                     try:
                         xmlsuffixpos = xmlword.findall(
                             ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                             % (NS, NS, NS, NS, NS)
                         )
                         xmlsuffixpos2 = xmlword.findall(
                             ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                             % (NS, NS, NS, NS, NS)
                         )
                         if xmlsuffixpos2:
                             suffixTag = (
                                 xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                             )
                         else:
                             suffixTag = xmlsuffixpos[0].text
                     except:
                         pass
                     if suffixTag:
                         tag += "~" + suffixTag
                     word = (word, tag)
                 # relational
                 # the gold standard is stored in
                 # <mor></mor><mor type="trn"><gra type="grt">
                 if relation == True:
                     for xmlstem_rel in xmlword.findall(
                         ".//{%s}mor/{%s}gra" % (NS, NS)
                     ):
                         if not xmlstem_rel.get("type") == "grt":
                             word = (
                                 word[0],
                                 word[1],
                                 xmlstem_rel.get("index")
                                 + "|"
                                 + xmlstem_rel.get("head")
                                 + "|"
                                 + xmlstem_rel.get("relation"),
                             )
                         else:
                             word = (
                                 word[0],
                                 word[1],
                                 word[2],
                                 word[0],
                                 word[1],
                                 xmlstem_rel.get("index")
                                 + "|"
                                 + xmlstem_rel.get("head")
                                 + "|"
                                 + xmlstem_rel.get("relation"),
                             )
                     try:
                         for xmlpost_rel in xmlword.findall(
                             ".//{%s}mor/{%s}mor-post/{%s}gra" % (NS, NS, NS)
                         ):
                             if not xmlpost_rel.get("type") == "grt":
                                 suffixStem = (
                                     suffixStem[0],
                                     suffixStem[1],
                                     xmlpost_rel.get("index")
                                     + "|"
                                     + xmlpost_rel.get("head")
                                     + "|"
                                     + xmlpost_rel.get("relation"),
                                 )
                             else:
                                 suffixStem = (
                                     suffixStem[0],
                                     suffixStem[1],
                                     suffixStem[2],
                                     suffixStem[0],
                                     suffixStem[1],
                                     xmlpost_rel.get("index")
                                     + "|"
                                     + xmlpost_rel.get("head")
                                     + "|"
                                     + xmlpost_rel.get("relation"),
                                 )
                     except:
                         pass
                 sents.append(word)
             if sent or relation:
                 results.append(sents)
             else:
                 results.extend(sents)
     return LazyMap(lambda x: x, results)
Example #29
 def srl_spans(self, fileids=None):
     self._require(self.SRL)
     return LazyMap(self._get_srl_spans, self._grids(fileids))
Example #30
 def tagged_words(self, fileids=None, simplify_tags=False):
     self._require(self.WORDS, self.POS)
     def get_tagged_words(grid):
         return self._get_tagged_words(grid, simplify_tags)
     return LazyConcatenation(LazyMap(get_tagged_words,
                                      self._grids(fileids)))