    def __init__(self, data, count, problemName=None):
        self.problemName = problemName
        # wrap each training word in a Morph built from its tokenized form
        self.data = [Morph(tokenize(x)) for x in data]
        self.count = count
        self.bank = FeatureBank([w for w in data])

        self.maximumObservationLength = max([len(w) for w in self.data]) + 1
Example #2
    def restrict(self, newData):
        # shallow-copy this problem, then re-wrap the new training data
        restriction = copy.copy(self)
        restriction.data = [
            tuple(None if i == None else (
                i if isinstance(i, Morph) else Morph(tokenize(i)))
                  for i in Lex) for Lex in newData
        ]
        return restriction
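
A minimal, self-contained sketch of the same shallow-copy-and-override pattern using only the standard library; the class and field names here are illustrative stand-ins, not the original project's API:

import copy

class ToyProblem(object):
    def __init__(self, data):
        self.data = data

    def restrict(self, newData):
        # copy.copy shares every attribute with the original except the replaced data
        restriction = copy.copy(self)
        restriction.data = list(newData)
        return restriction

small = ToyProblem([u"kat", u"dog", u"mus"]).restrict([u"kat"])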
Example #3
    def __init__(self, data, CPUs=1):
        self.CPUs = CPUs
        self.bank = FeatureBank([w for l in data
                                 for w in l if w != None] + [u'?', u'*'])
        self.numberOfInflections = len(data[0])
        # wrap the data in Morph objects if it isn't already
        self.data = [
            tuple(None if i == None else (
                i if isinstance(i, Morph) else Morph(tokenize(i)))
                  for i in Lex) for Lex in data
        ]

        self.maximumObservationLength = max(
            [len(w) for l in self.data for w in l if w != None])
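
The data-wrapping step above follows a small coercion idiom: keep None cells, pass through values that are already Morphs, and tokenize-and-wrap raw strings. A standalone sketch of that idiom with a trivial stand-in wrapper (Morph and tokenize are project-specific, so they are replaced here purely for illustration):

class Wrapped(object):
    # stand-in for Morph: just holds a list of symbols
    def __init__(self, symbols):
        self.symbols = symbols

def wrap_table(data):
    # preserve None, leave already-wrapped cells alone, wrap raw strings
    return [tuple(None if cell is None else
                  (cell if isinstance(cell, Wrapped) else Wrapped(list(cell)))
                  for cell in lex)
            for lex in data]

table = wrap_table([(u"kat", None), (Wrapped([u"d", u"o", u"g"]), u"dogz")])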
Example #4
    def restrict(self, newData):
        """Creates a new version of this object which is identical but has different training data"""
        restriction = copy.copy(self)
        restriction.data = [ tuple( None if i == None else (i if isinstance(i,Morph) else Morph(tokenize(i)))
                                    for i in Lex)
                             for Lex in newData ]
        return restriction
Example #5
    def __init__(self, data, problemName=None, bank = None, useSyllables = False, UG = None,
                 fixedMorphology = None):
        self.problemName = problemName
        self.UG = UG
        self.countingProblem = problemName == "Odden_2.4_Tibetan"

        if bank != None: self.bank = bank
        else:
            self.bank = FeatureBank([ w for l in data for w in l if w != None ] + ([u'-'] if useSyllables else []))

        self.numberOfInflections = len(data[0])
        for d in data: assert len(d) == self.numberOfInflections
        
        # wrap the data in Morph objects if it isn't already
        self.data = [ tuple( None if i == None else (i if isinstance(i,Morph) else Morph(tokenize(i)))
                             for i in Lex)
                      for Lex in data ]

        self.maximumObservationLength = max([ len(w) for l in self.data for w in l if w != None ])

        self.wordBoundaries = any([ (u'##' in w.phonemes) for l in self.data for w in l if w ])

        # fixedMorphology : list of morphologies, one for each inflection
        # Each morphology is either None (don't fix it) or a pair of (prefix, suffix)
        if fixedMorphology == None: fixedMorphology = [None]*self.numberOfInflections
        self.fixedMorphology = fixedMorphology
        assert len(self.fixedMorphology) == self.numberOfInflections

        self.inflectionsPerObservation = sum(x is not None
                                             for xs in self.data for x in xs )/len(self.data)

        self.pervasiveTimeout = None

        self.precomputedAlignment = None
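
As the comment above spells out, fixedMorphology carries one slot per inflection, each either None (let the solver search for that inflection's affixes) or a (prefix, suffix) pair. A hypothetical value for a three-inflection problem, with the affixes left as plain strings since this excerpt does not show how the pairs are consumed:

# hypothetical: pin the second inflection's affixes, leave the others unconstrained
fixedMorphology = [None, (u"", u"s"), None]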
Example #6
    def __init__(self, phonemes):
        # accept either a raw (unicode) string or an already-tokenized phoneme sequence
        if isinstance(phonemes,unicode): phonemes = tokenize(phonemes)
        self.phonemes = phonemes
Example #7
    constantSuffix = arguments.problem.endswith('x')

    if arguments.problem in stimuliFromLiterature:
        trainingData = stimuliFromLiterature[arguments.problem][:arguments.number]
        if len(trainingData) < arguments.number:
            if 'x' in arguments.problem:
                assert arguments.problem == 'aax'
                trainingData += sampling[arguments.problem](arguments.number - len(trainingData),
                                                            X='di')
            else:
                trainingData += sampling[arguments.problem](arguments.number - len(trainingData))
    else:
        trainingData = sampling[arguments.problem](arguments.number)
    
    print u"\n".join(trainingData)
    surfaceLength = sum([len(tokenize(w)) for w in trainingData ])

    costToSolution = {}
    if arguments.load != None:
        assert not arguments.quiet
        costToSolution = loadPickle(arguments.load)
    else:
        worker = UnderlyingProblem([(w,) for w in trainingData ],
                                   useSyllables = not arguments.noSyllables)
        solutions, costs = worker.paretoFront(arguments.depth, (arguments.top+1)//2, TEMPERATURE,
                                              useMorphology = True,
                                              morphologicalCoefficient = 4,
                                              offFront=arguments.top//2)
        for solution, cost in zip(solutions, costs): costToSolution[cost] = solution

    if arguments.save != None:
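
The sampling logic at the top of this example, take up to the requested number of stimuli from the literature and then top up the shortfall by sampling, can be isolated into a small self-contained helper; the names below are illustrative, not part of the original script:

def take_or_sample(pool, n, sample_more):
    # take up to n items from the fixed pool, then sample whatever is still missing
    taken = pool[:n]
    if len(taken) < n:
        taken = taken + sample_more(n - len(taken))
    return taken

# toy usage: pad a short pool with generated items
examples = take_or_sample([u"ab", u"ba"], 4, lambda k: [u"xy"] * k)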
Example #8
    def toMorph(z):
        # pass Morph values through unchanged; tokenize and wrap strings; reject anything else
        if isinstance(z, Morph): return z
        elif isinstance(z, (unicode, str)): return Morph(tokenize(z))
        else: assert False
Example #9
    (u"man", u"manmandə"),
    #                   (u"kwaj",u"kwajkwajdə"),
    #                   (u"çin",u"çinçində"),
    (u"le", u"leledə")
]

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Learn pig Latin and Chinese")
    parser.add_argument(
        'task',
        choices=["Chinese", "Latin", "Latin1", "Latin2", "Latin3"],
        default="Latin")
    parser.add_argument("-d", "--depth", default=1, type=int)
    arguments = parser.parse_args()

    examples = data[arguments.task]
    depth = arguments.depth

    leaveSketchOutput()
    solution = SupervisedProblem(
        [(Morph(tokenize(x)), Constant(0), Morph(tokenize(y)))
         for x, y in examples],
        syllables=True).solve(depth)

    if solution == None:
        print "No solution."
    else:
        for r in solution:
            print r
Example #10
def main():
    args = parse()
    SARC = utils.SARC_POL
    train_file = SARC + '\\train-balanced.csv'
    test_file = SARC + '\\test-balanced.csv'
    comment_file = SARC + '\\comments.json'
    # Load SARC pol/main sequences with labels.
    print('Load SARC data')
    train_seqs, test_seqs, train_labels, test_labels = utils.load_sarc_responses(
        train_file, test_file, comment_file, lower=args.lower)

    # Only use responses for this method. Ignore ancestors.
    train_resp = train_seqs['responses']
    test_resp = test_seqs['responses']

    # Split into first and second responses and their labels.
    # {0: list_of_first_responses, 1: list_of_second_responses}
    train_docs = {i: [l[i] for l in train_resp] for i in range(2)}
    test_docs = {i: [l[i] for l in test_resp] for i in range(2)}
    train_labels = {
        i: [2 * int(l[i]) - 1 for l in train_labels]
        for i in range(2)
    }
    test_labels = {
        i: [2 * int(l[i]) - 1 for l in test_labels]
        for i in range(2)
    }
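    # Note: 2 * int(l[i]) - 1 remaps the 0/1 labels to -1/+1 (0 -> -1, 1 -> +1),
    # matching the -1/+1 predictions computed in the final evaluation below.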

    # Train a classifier on all responses in training data. We will later use this
    # classifier to determine for every sequence which of the 2 responses is more sarcastic.
    train_all_docs_tok = features.tokenize(train_docs[0] + train_docs[1])
    test_all_docs_tok = features.tokenize(test_docs[0] + test_docs[1])
    train_all_labels = np.array(train_labels[0] + train_labels[1])
    test_all_labels = np.array(test_labels[0] + test_labels[1])

    # Bongs or embeddings.
    if args.embed:
        print('Create embeddings')
        weights = None
        if args.weights == 'sif':
            weights = features.sif_weights(train_all_docs_tok, 1E-3)
        if args.weights == 'snif':
            weights = features.sif_weights(train_all_docs_tok, 1E-3)
            weights = {f: 1 - w for f, w in weights.items()}
        w2v = vectors.vocab2vecs(
            {
                word
                for doc in train_all_docs_tok + test_all_docs_tok
                for word in doc
            },
            vectorfile=args.embedding)
        train_all_vecs = vectors.docs2vecs(train_all_docs_tok,
                                           f2v=w2v,
                                           weights=weights)
        test_all_vecs = vectors.docs2vecs(test_all_docs_tok,
                                          f2v=w2v,
                                          weights=weights)
    else:
        print('Create bongs')
        n = args.n
        min_count = args.min_count
        train_ngrams = [
            sum((list(nltk.ngrams(doc, k)) for k in range(1, n + 1)), [])
            for doc in train_all_docs_tok
        ]
        test_ngrams = [
            sum((list(nltk.ngrams(doc, k)) for k in range(1, n + 1)), [])
            for doc in test_all_docs_tok
        ]
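        # Each document is now one flat list of all its 1- through n-grams, e.g. for n = 2
        # ['a', 'b', 'c'] becomes [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c')].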
        vocabulary = features.feature_vocab(train_ngrams, min_count=min_count)
        train_all_vecs = features.docs2bofs(train_ngrams, vocabulary)
        test_all_vecs = features.docs2bofs(test_ngrams, vocabulary)

    # Normalize?
    if args.normalize:
        normalize(train_all_vecs, copy=False)
        normalize(test_all_vecs, copy=False)
    print('Dimension of representation: %d' % train_all_vecs.shape[1])

    # Evaluate this classifier on all responses.
    #print('Evaluate the classifier on all responses')
    clf = LogitCV(Cs=[10**i for i in range(-2, 3)],
                  fit_intercept=False,
                  cv=2,
                  dual=np.less(*train_all_vecs.shape),
                  solver='liblinear',
                  n_jobs=-1,
                  random_state=0)
    clf.fit(train_all_vecs, train_all_labels)
    #print(clf)
    #print(clf.predict(test_all_vecs))
    #print('\tTrain acc: ', clf.score(train_all_vecs, train_all_labels))
    #print('\tTest acc: ', clf.score(test_all_vecs, test_all_labels))
    #print(test_all_vecs)
    # Get vectors for first and second responses.
    n_tr = int(train_all_vecs.shape[0] / 2)
    n_te = int(test_all_vecs.shape[0] / 2)
    train_vecs = {
        i: train_all_vecs[i * n_tr:(i + 1) * n_tr, :]
        for i in range(2)
    }
    test_vecs = {
        i: test_all_vecs[i * n_te:(i + 1) * n_te, :]
        for i in range(2)
    }
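    # train_all_vecs stacks all first responses followed by all second responses,
    # so rows [0, n_tr) hold response 0 and rows [n_tr, 2*n_tr) hold response 1
    # (and likewise for the test split).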
    #print(test_vecs)
    # Final evaluation.
    #print('Evaluate the classifier on the original dataset')
    hyperplane = clf.coef_[0, :]
    train_pred_labels = 2 * (train_vecs[0].dot(hyperplane) >
                             train_vecs[1].dot(hyperplane)) - 1
    test_pred_labels = 2 * (test_vecs[0].dot(hyperplane) >
                            test_vecs[1].dot(hyperplane)) - 1
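    # Each pair is scored by comparing its two responses under the learned hyperplane:
    # predict +1 (the first response is the more sarcastic one) iff it scores higher than the second.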
    train_expect_labels = train_labels[0]
    test_expect_labels = test_labels[0]
    #print('\tTrain acc: ', (train_pred_labels == train_expect_labels).sum() / train_pred_labels.shape[0])
    #print('\tTest acc: ', (test_pred_labels == test_expect_labels).sum() / test_pred_labels.shape[0])
    joblib.dump(clf, 'model.pkl')
    joblib.dump(test_all_vecs, 'vecs.pkl')
    '''def testallvecs(test_all_vecs):