def words(self):
    return LazyMap(self._word_tokenize, self.text())
def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly drawing
    from a sample of the possible permutations. At the limit of the number
    of possible permutations, the significance level is exact. The
    approximate significance level is the sample mean number of times the
    statistic of the permuted lists varies from the actual statistic of
    the unpermuted argument lists.

    :return: a tuple containing an approximate significance level, the count
             of the number of times the pseudo-statistic varied from the
             actual statistic, and the number of shuffles
    :rtype: tuple
    :param a: a list of test values
    :type a: list
    :param b: another list of independently generated test values
    :type b: list
    """
    shuffles = kwargs.get('shuffles', 999)
    # there's no point in trying to shuffle beyond all possible permutations
    shuffles = \
        min(shuffles, reduce(operator.mul, xrange(1, len(a) + len(b) + 1)))
    stat = kwargs.get('statistic', lambda lst: float(sum(lst)) / len(lst))
    verbose = kwargs.get('verbose', False)

    if verbose:
        print 'shuffles: %d' % shuffles

    actual_stat = fabs(stat(a) - stat(b))

    if verbose:
        print 'actual statistic: %f' % actual_stat
        print '-' * 60

    c = 1e-100
    lst = LazyConcatenation([a, b])
    indices = range(len(a) + len(b))

    for i in range(shuffles):
        if verbose and i % 10 == 0:
            print 'shuffle: %d' % i

        shuffle(indices)

        pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[:len(a)]))
        pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a):]))
        pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if verbose and i % 10 == 0:
            print 'pseudo-statistic: %f' % pseudo_stat
            print 'significance: %f' % (float(c + 1) / (i + 1))
            print '-' * 60

    significance = float(c + 1) / (shuffles + 1)

    if verbose:
        print 'significance: %f' % significance
        if betai:
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print "prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi))

    return (significance, c, shuffles)
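# A minimal usage sketch for approxrand above; the score lists and the
# `shuffles` value are made up for illustration. A small significance level
# suggests the observed difference in means is unlikely under random
# relabeling of the two samples.
scores_a = [0.91, 0.88, 0.95, 0.87, 0.90]
scores_b = [0.84, 0.86, 0.89, 0.82, 0.85]
significance, count, shuffles = approxrand(scores_a, scores_b, shuffles=999)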
def sents(self, fileids=None):
    self._require(self.WORDS)
    return LazyMap(self._get_words, self._grids(fileids))
def _get_words(self, fileid, sent, stem, relation, pos, strip_space, replace):
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall('.//{%s}u' % NS):
        sents = [xmlsent.get('uID'), xmlsent.get('who')]
        #print(xmlsent.get('uID'), xmlsent.get('who'))
        # select speakers
        #if speaker == 'ALL' or xmlsent.get('who') in speaker:
        for xmlword in xmlsent.findall('.//{%s}w' % NS):
            infl = None
            suffixStem = None
            suffixTag = None
            # getting replaced words
            if replace and xmlsent.find('.//{%s}w/{%s}replacement' % (NS, NS)):
                xmlword = xmlsent.find(
                    './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS))
            elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS)):
                xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS))
            # get text
            if xmlword.text:
                word = xmlword.text
            else:
                word = ''
            # strip trailing space
            if strip_space:
                word = word.strip()
            # stem
            '''
            if relation or stem:
                try:
                    xmlstem = xmlword.find('.//{%s}stem' % NS)
                    word = xmlstem.text
                except AttributeError as e:
                    pass
                # if there is an inflection
                try:
                    xmlinfl = xmlword.find(
                        './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS))
                    word += '-' + xmlinfl.text
                except:
                    pass
                # if there is a suffix
                try:
                    xmlsuffix = xmlword.find(
                        './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                        % (NS, NS, NS, NS))
                    suffixStem = xmlsuffix.text
                except AttributeError:
                    suffixStem = ""
                if suffixStem:
                    word += "~" + suffixStem
            # pos'''
            if relation or pos:
                try:
                    xmlpos = xmlword.findall(".//{%s}c" % NS)
                    xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                    if xmlpos2 != []:
                        tag = xmlpos[0].text + ":" + xmlpos2[0].text
                    else:
                        tag = xmlpos[0].text
                except (AttributeError, IndexError) as e:
                    tag = ""
                try:
                    xmlsuffixpos = xmlword.findall(
                        './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
                        % (NS, NS, NS, NS, NS))
                    xmlsuffixpos2 = xmlword.findall(
                        './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
                        % (NS, NS, NS, NS, NS))
                    if xmlsuffixpos2:
                        suffixTag = xmlsuffixpos[0].text + ":" \
                            + xmlsuffixpos2[0].text
                    else:
                        suffixTag = xmlsuffixpos[0].text
                except:
                    pass
                if suffixTag:
                    tag += "~" + suffixTag
                word = (word, tag)
            # relational
            # the gold standard is stored in
            # <mor></mor><mor type="trn"><gra type="grt">
            if relation == True:
                for xmlstem_rel in xmlword.findall(
                        './/{%s}mor/{%s}gra' % (NS, NS)):
                    if not xmlstem_rel.get('type') == 'grt':
                        word = (word[0], word[1],
                                xmlstem_rel.get('index') + "|"
                                + xmlstem_rel.get('head') + "|"
                                + xmlstem_rel.get('relation'))
                    else:
                        word = (word[0], word[1], word[2], word[0], word[1],
                                xmlstem_rel.get('index') + "|"
                                + xmlstem_rel.get('head') + "|"
                                + xmlstem_rel.get('relation'))
                try:
                    for xmlpost_rel in xmlword.findall(
                            './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)):
                        if not xmlpost_rel.get('type') == 'grt':
                            suffixStem = (suffixStem[0], suffixStem[1],
                                          xmlpost_rel.get('index') + "|"
                                          + xmlpost_rel.get('head') + "|"
                                          + xmlpost_rel.get('relation'))
                        else:
                            suffixStem = (suffixStem[0], suffixStem[1],
                                          suffixStem[2], suffixStem[0],
                                          suffixStem[1],
                                          xmlpost_rel.get('index') + "|"
                                          + xmlpost_rel.get('head') + "|"
                                          + xmlpost_rel.get('relation'))
                except:
                    pass
            sents.append(word)
        #print(sents)
        if sent or relation:
            results.append(sents)
        else:
            results.extend(sents)
    return LazyMap(lambda x: x, results)
def sents(self, fileids=None):
    return LazyMap(untagged, self.elements(fileids))
def tagged_words(self, fileids=None, tagset=None):
    self._require(self.WORDS, self.POS)

    def get_tagged_words(grid):
        return self._get_tagged_words(grid, tagset)

    return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))
def sents(self):
    # LazyMap from nltk.util:
    return LazyMap(lambda t: t.leaves(), self.get_trees())
print("Running on dev") test_data = [json.loads(line) for line in open_file("twt.dev.json")] else: print("Running on test") test_data = [json.loads(line) for line in open_file("twt.test.json")] test_data = handle_lowfreq_words(vocab)(test_data) twitter_model = hmm.HiddenMarkovModelTagger(symbols=hmm_model.symbols, states=tagset, transitions=transition_model, outputs=emission_model, priors=init_model) # Compute the accuracy - we can call this, but then we just do extra decoding # work. What we really need is just call nltk.metrics.accuracy on the gold and # predicted. # twitter_model.test( test_data ) # Compute the confusion matrix, technically we would be doing this twice, as # when computing accuracy we would've already done this. It would be more # optimal to modify the hmm library. But meh. gold = tag_list(test_data) unlabeled_data = LazyMap(unlabeled_words, test_data) predicted_labels = list(LazyMap(twitter_model.tag, unlabeled_data)) predicted = tag_list(predicted_labels) acc = accuracy(gold, predicted) print("Accuracy: ", acc) cm = ConfusionMatrix(gold, predicted) print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=25))
def transform(labeled_symbols):
    return LazyMap(relabel, labeled_symbols)
def tagged_words(self):
    return LazyConcatenation(LazyMap(self._tagger.tag, self.sents()))
def chunked_sents(self):
    return LazyMap(self._chunker.chunk, self.tagged_sents())
def tagged_sents(self):
    return LazyMap(self._tagger.tag, self.sents())
def zipzip(*lists):
    return LazyMap(lambda lst: zip(*lst), LazyZip(*lists))
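# Usage sketch for zipzip (data made up): LazyZip pairs up the i-th items of
# each input list, and the mapped zip(*...) transposes each pair, so zipping
# per-sentence word lists with tag lists lazily yields (word, tag) pairs.
# Under Python 3, each element is a zip iterator, hence the list() call.
words = [['a', 'cat'], ['it', 'ran']]
tags = [['DT', 'NN'], ['PRP', 'VBD']]
for pairs in zipzip(words, tags):
    print(list(pairs))  # [('a', 'DT'), ('cat', 'NN')], then [('it', 'PRP'), ('ran', 'VBD')]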
def parsed_sents(self):
    return LazyMap(self._chunker.parse, self.tagged_sents())
def tagged_sents(self, fileids=None):
    def f(s):
        return [(w, simple_tag(t)) for w, t in s]

    return LazyMap(f, super().tagged_sents(fileids))
def tagged_sents(self, fileids=None):
    self._require(self.WORDS, self.POS)
    return LazyMap(self._get_tagged_words, self._grids(fileids))
def tagged_sents(self):
    # LazyMap from nltk.util:
    f = lambda t: [(x, x) for x in t.leaves()]
    return LazyMap(f, self.get_trees())
    os.remove(test_tmp)
    sys.exit(0)

if options.trainer:
    if options.pos:
        reader = MXPostTaggerCorpusReader(eval(options.corpus))
        iob_sents = reader.iob_sents()
        tagged_sents = reader.tagged_sents()
        corpus = LazyMap(lambda (iob_sent, tagged_sent):
                             [(iw, tt, iob)
                              for ((iw, iob), (tw, tt))
                              in zip(iob_sent, tagged_sent)],
                         LazyZip(iob_sents, tagged_sents))
    else:
        iob_sents = eval(options.corpus).iob_sents()
        corpus = LazyMap(lambda iob_sent:
                             [(w, None, i) for w, i in iob_sent],
                         iob_sents)

    num_train, num_test = options.numsents
    num_train = num_train or int(len(corpus) * 0.9)
    num_test = num_test or (len(corpus) - num_train)
    train = corpus[:num_train]
    test = corpus[num_train:num_train + num_test]

    trainer = eval(options.trainer)
    if options.verbose:
        print 'Training %s with %d sentences' % \
            (options.trainer, num_train)
    ner = trainer(train,
                  feature_detector=NERChunkTaggerFeatureDetector,
                  chunk_types=_NE_CHUNK_TYPES,
                  verbose=options.verbose)
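# Note: the tuple-unpacking parameter `lambda (iob_sent, tagged_sent): ...`
# above is Python 2 only syntax (removed in Python 3 by PEP 3113). A sketch
# of an equivalent Python 3 form, unpacking inside the body instead:
corpus = LazyMap(
    lambda pair: [(iw, tt, iob)
                  for ((iw, iob), (tw, tt)) in zip(pair[0], pair[1])],
    LazyZip(iob_sents, tagged_sents))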
def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
    self._require(self.WORDS, self.POS, self.TREE)
    if pos_in_tree is None:
        pos_in_tree = self._pos_in_tree

    def get_parsed_sent(grid):  # capture pos_in_tree as local var
        return self._get_parsed_sent(grid, pos_in_tree, tagset)

    return LazyMap(get_parsed_sent, self._grids(fileids))
def iterate_from(self, start):
    f = lambda d: d.get(self.field, '')
    return iter(
        LazyMap(f, self.collection.find(fields=[self.field], skip=start)))
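# Note: `fields=` is the legacy PyMongo 2.x keyword to Collection.find();
# PyMongo 3+ renamed it to `projection`. A hypothetical modernized call:
#     self.collection.find(projection=[self.field], skip=start)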
def tagged_sents(self):
    # LazyMap from nltk.util:
    return LazyMap(lambda t: t.pos(), self.get_trees())
def words(self):
    return LazyConcatenation(LazyMap(self._word_tokenize, self.text()))
def parsed_sents(self, fileids=None):
    return LazyMap(parsed, self.elements(fileids))
def sents(self):
    return LazyConcatenation(LazyMap(self._sent_tokenize, self.text()))
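# The wrappers above illustrate the recurring pattern in this file: LazyMap
# applies a function to each element without materializing a list, and
# LazyConcatenation flattens a lazy sequence of lists into one lazy sequence.
# A minimal self-contained sketch with made-up data:
from nltk.util import LazyConcatenation, LazyMap

texts = ['a b', 'c d e']
tokenized = LazyMap(str.split, texts)     # lazily: [['a', 'b'], ['c', 'd', 'e']]
flattened = LazyConcatenation(tokenized)  # lazily: ['a', 'b', 'c', 'd', 'e']
print(list(flattened))                    # ['a', 'b', 'c', 'd', 'e']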
def tagged_lem_sents(self, fileids=None, tagset=None):
    self._require(self.WORDS, self.POS, self.LEMMA)

    def get_tagged_lemmas(grid):
        return self._get_tagged_lemmas(grid, tagset)

    return LazyMap(get_tagged_lemmas, self._grids(fileids))
def dep_srl_spans(self, fileids=None):
    self._require(self.SRL, self.FILLPRED)
    return LazyMap(self._get_dep_srl_spans, self._grids(fileids))
def words(self, fileids=None):
    self._require(self.WORDS)
    return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))
def _get_words(
    self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
):
    if (
        isinstance(speaker, str) and speaker != "ALL"
    ):  # ensure we have a list of speakers
        speaker = [speaker]
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall(".//{%s}u" % NS):
        sents = []
        # select speakers
        if speaker == "ALL" or xmlsent.get("who") in speaker:
            for xmlword in xmlsent.findall(".//{%s}w" % NS):
                infl = None
                suffixStem = None
                suffixTag = None
                # getting replaced words
                if replace and xmlsent.find(".//{%s}w/{%s}replacement" % (NS, NS)):
                    xmlword = xmlsent.find(
                        ".//{%s}w/{%s}replacement/{%s}w" % (NS, NS, NS)
                    )
                elif replace and xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS)):
                    xmlword = xmlsent.find(".//{%s}w/{%s}wk" % (NS, NS))
                # get text
                if xmlword.text:
                    word = xmlword.text
                else:
                    word = ""
                # strip trailing space
                if strip_space:
                    word = word.strip()
                # stem
                if relation or stem:
                    try:
                        xmlstem = xmlword.find(".//{%s}stem" % NS)
                        word = xmlstem.text
                    except AttributeError as e:
                        pass
                    # if there is an inflection
                    try:
                        xmlinfl = xmlword.find(
                            ".//{%s}mor/{%s}mw/{%s}mk" % (NS, NS, NS)
                        )
                        word += "-" + xmlinfl.text
                    except:
                        pass
                    # if there is a suffix
                    try:
                        xmlsuffix = xmlword.find(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                            % (NS, NS, NS, NS)
                        )
                        suffixStem = xmlsuffix.text
                    except AttributeError:
                        suffixStem = ""
                    if suffixStem:
                        word += "~" + suffixStem
                # pos
                if relation or pos:
                    try:
                        xmlpos = xmlword.findall(".//{%s}c" % NS)
                        xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                        if xmlpos2 != []:
                            tag = xmlpos[0].text + ":" + xmlpos2[0].text
                        else:
                            tag = xmlpos[0].text
                    except (AttributeError, IndexError) as e:
                        tag = ""
                    try:
                        xmlsuffixpos = xmlword.findall(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                            % (NS, NS, NS, NS, NS)
                        )
                        xmlsuffixpos2 = xmlword.findall(
                            ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                            % (NS, NS, NS, NS, NS)
                        )
                        if xmlsuffixpos2:
                            suffixTag = (
                                xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                            )
                        else:
                            suffixTag = xmlsuffixpos[0].text
                    except:
                        pass
                    if suffixTag:
                        tag += "~" + suffixTag
                    word = (word, tag)
                # relational
                # the gold standard is stored in
                # <mor></mor><mor type="trn"><gra type="grt">
                if relation == True:
                    for xmlstem_rel in xmlword.findall(
                        ".//{%s}mor/{%s}gra" % (NS, NS)
                    ):
                        if not xmlstem_rel.get("type") == "grt":
                            word = (
                                word[0],
                                word[1],
                                xmlstem_rel.get("index")
                                + "|"
                                + xmlstem_rel.get("head")
                                + "|"
                                + xmlstem_rel.get("relation"),
                            )
                        else:
                            word = (
                                word[0],
                                word[1],
                                word[2],
                                word[0],
                                word[1],
                                xmlstem_rel.get("index")
                                + "|"
                                + xmlstem_rel.get("head")
                                + "|"
                                + xmlstem_rel.get("relation"),
                            )
                    try:
                        for xmlpost_rel in xmlword.findall(
                            ".//{%s}mor/{%s}mor-post/{%s}gra" % (NS, NS, NS)
                        ):
                            if not xmlpost_rel.get("type") == "grt":
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    xmlpost_rel.get("index")
                                    + "|"
                                    + xmlpost_rel.get("head")
                                    + "|"
                                    + xmlpost_rel.get("relation"),
                                )
                            else:
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    suffixStem[2],
                                    suffixStem[0],
                                    suffixStem[1],
                                    xmlpost_rel.get("index")
                                    + "|"
                                    + xmlpost_rel.get("head")
                                    + "|"
                                    + xmlpost_rel.get("relation"),
                                )
                    except:
                        pass
                sents.append(word)
        if sent or relation:
            results.append(sents)
        else:
            results.extend(sents)
    return LazyMap(lambda x: x, results)
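# The method above backs NLTK's CHILDESCorpusReader. A usage sketch, assuming
# a local copy of a CHILDES XML corpus (the corpus_root path is hypothetical):
from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = '/path/to/childes/Eng-USA/'  # hypothetical path
reader = CHILDESCorpusReader(corpus_root, r'.*\.xml')
# Sentences restricted to the target child; strip_space defaults to True.
print(reader.sents(reader.fileids()[0], speaker='CHI')[:2])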
def srl_spans(self, fileids=None):
    self._require(self.SRL)
    return LazyMap(self._get_srl_spans, self._grids(fileids))
def tagged_words(self, fileids=None, simplify_tags=False):
    self._require(self.WORDS, self.POS)

    def get_tagged_words(grid):
        return self._get_tagged_words(grid, simplify_tags)

    return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))