def __init__(self, data, count, problemName=None):
    self.problemName = problemName
    self.data = [Morph(tokenize(x)) for x in data]
    self.count = count
    self.bank = FeatureBank([w for w in data])
    self.maximumObservationLength = max([len(w) for w in self.data]) + 1
def restrict(self, newData):
    restriction = copy.copy(self)
    restriction.data = [tuple(None if i is None else (i if isinstance(i, Morph) else Morph(tokenize(i)))
                              for i in Lex)
                        for Lex in newData]
    return restriction
def __init__(self, data, CPUs=1):
    self.CPUs = CPUs
    self.bank = FeatureBank([w for l in data for w in l if w is not None] + [u'?', u'*'])
    self.numberOfInflections = len(data[0])
    # wrap the data in Morph objects if it isn't already
    self.data = [tuple(None if i is None else (i if isinstance(i, Morph) else Morph(tokenize(i)))
                       for i in Lex)
                 for Lex in data]
    self.maximumObservationLength = max(len(w) for l in self.data for w in l if w is not None)
def restrict(self, newData):
    """Creates a new version of this object which is identical but has different training data."""
    restriction = copy.copy(self)
    restriction.data = [tuple(None if i is None else (i if isinstance(i, Morph) else Morph(tokenize(i)))
                              for i in Lex)
                        for Lex in newData]
    return restriction
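# Usage sketch for restrict() (hypothetical data; "problem" stands for an
# instance of the enclosing class). Because copy.copy is shallow, the
# restricted problem shares expensive state such as the FeatureBank with the
# original and only rebinds its training data:
#
#   smaller = problem.restrict([(u"katab", u"aktib")])
#   assert smaller.bank is problem.bank   # shared, not recomputed
#   assert len(smaller.data) == 1         # only the new training data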
def __init__(self, data, problemName=None, bank=None, useSyllables=False, UG=None,
             fixedMorphology=None):
    self.problemName = problemName
    self.UG = UG

    self.countingProblem = (problemName == "Odden_2.4_Tibetan")

    if bank is not None:
        self.bank = bank
    else:
        self.bank = FeatureBank([w for l in data for w in l if w is not None] +
                                ([u'-'] if useSyllables else []))

    self.numberOfInflections = len(data[0])
    for d in data:
        assert len(d) == self.numberOfInflections

    # wrap the data in Morph objects if it isn't already
    self.data = [tuple(None if i is None else (i if isinstance(i, Morph) else Morph(tokenize(i)))
                       for i in Lex)
                 for Lex in data]

    self.maximumObservationLength = max(len(w) for l in self.data for w in l if w is not None)
    self.wordBoundaries = any(u'##' in w.phonemes for l in self.data for w in l if w)

    # fixedMorphology: list of morphologies, one for each inflection.
    # Each morphology is either None (don't fix it) or a pair of (prefix, suffix).
    if fixedMorphology is None:
        fixedMorphology = [None] * self.numberOfInflections
    self.fixedMorphology = fixedMorphology
    assert len(self.fixedMorphology) == self.numberOfInflections

    # Average number of observed (non-None) inflections per lexeme; cast to
    # float so Python 2 integer division cannot truncate the average.
    self.inflectionsPerObservation = (float(sum(x is not None for xs in self.data for x in xs))
                                      / len(self.data))

    self.pervasiveTimeout = None
    self.precomputedAlignment = None
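# Usage sketch, assuming this is the UnderlyingProblem constructor invoked
# later in this section, with a hypothetical two-inflection paradigm. Each row
# is one lexeme, each column one inflection, and None marks an unobserved
# surface form:
#
#   matrix = [(u"kat", u"katə"),
#             (u"dol", None)]
#   problem = UnderlyingProblem(matrix)
#   assert problem.numberOfInflections == 2
#   assert problem.data[1][1] is None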
def __init__(self, phonemes):
    if isinstance(phonemes, unicode):
        phonemes = tokenize(phonemes)
    self.phonemes = phonemes
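# Usage sketch (hypothetical phonemes): the constructor accepts either a raw
# unicode string, which it tokenizes itself, or an already-tokenized list:
#
#   Morph(u"kat")                 # tokenized in the constructor
#   Morph(tokenize(u"kat"))       # equivalent, explicit tokenization
#   Morph([u"k", u"a", u"t"])     # phoneme list stored as-is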
constantSuffix = arguments.problem.endswith('x')

if arguments.problem in stimuliFromLiterature:
    trainingData = stimuliFromLiterature[arguments.problem][:arguments.number]
    # If the literature does not provide enough stimuli, pad with sampled ones.
    if len(trainingData) < arguments.number:
        if 'x' in arguments.problem:
            assert arguments.problem == 'aax'
            trainingData += sampling[arguments.problem](arguments.number - len(trainingData),
                                                        X='di')
        else:
            trainingData += sampling[arguments.problem](arguments.number - len(trainingData))
else:
    trainingData = sampling[arguments.problem](arguments.number)

print u"\n".join(trainingData)

surfaceLength = sum(len(tokenize(w)) for w in trainingData)

costToSolution = {}
if arguments.load is not None:
    assert not arguments.quiet
    costToSolution = loadPickle(arguments.load)
else:
    worker = UnderlyingProblem([(w,) for w in trainingData],
                               useSyllables=not arguments.noSyllables)
    solutions, costs = worker.paretoFront(arguments.depth, (arguments.top + 1) // 2, TEMPERATURE,
                                          useMorphology=True, morphologicalCoefficient=4,
                                          offFront=arguments.top // 2)
    for solution, cost in zip(solutions, costs):
        costToSolution[cost] = solution
if arguments.save is not None:
def toMorph(z):
    if isinstance(z, Morph):
        return z
    elif isinstance(z, (unicode, str)):
        return Morph(tokenize(z))
    else:
        assert False, "toMorph: expected Morph or string, got %r" % (z,)
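# Usage sketch (hypothetical strings): toMorph is an idempotent coercion, so
# callers may freely mix Morph objects and raw strings:
#
#   m = toMorph(u"kat")       # unicode -> Morph
#   assert toMorph(m) is m    # an existing Morph passes through unchanged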
(u"man", u"manmandə"), # (u"kwaj",u"kwajkwajdə"), # (u"çin",u"çinçində"), (u"le", u"leledə") ] if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Learn pig Latin and Chinese") parser.add_argument( 'task', choices=["Chinese", "Latin", "Latin1", "Latin2", "Latin3"], default="Latin") parser.add_argument("-d", "--depth", default=1, type=int) arguments = parser.parse_args() examples = data[arguments.task] depth = arguments.depth leaveSketchOutput() solution = SupervisedProblem( [(Morph(tokenize(x)), Constant(0), Morph(tokenize(y))) for x, y in examples], syllables=True).solve(depth) if solution == None: print "No solution." else: for r in solution: print r
def main():
    args = parse()
    SARC = utils.SARC_POL
    train_file = SARC + '\\train-balanced.csv'
    test_file = SARC + '\\test-balanced.csv'
    comment_file = SARC + '\\comments.json'

    # Load SARC pol/main sequences with labels.
    print('Load SARC data')
    train_seqs, test_seqs, train_labels, test_labels = utils.load_sarc_responses(
        train_file, test_file, comment_file, lower=args.lower)

    # Only use responses for this method; ignore ancestors.
    train_resp = train_seqs['responses']
    test_resp = test_seqs['responses']

    # Split into first and second responses and their labels:
    # {0: list_of_first_responses, 1: list_of_second_responses}.
    # Labels are mapped from {0, 1} to {-1, +1}.
    train_docs = {i: [l[i] for l in train_resp] for i in range(2)}
    test_docs = {i: [l[i] for l in test_resp] for i in range(2)}
    train_labels = {i: [2 * int(l[i]) - 1 for l in train_labels] for i in range(2)}
    test_labels = {i: [2 * int(l[i]) - 1 for l in test_labels] for i in range(2)}

    # Train a classifier on all responses in the training data. We will later use
    # this classifier to decide, for every pair, which of the 2 responses is more
    # sarcastic.
    train_all_docs_tok = features.tokenize(train_docs[0] + train_docs[1])
    test_all_docs_tok = features.tokenize(test_docs[0] + test_docs[1])
    train_all_labels = np.array(train_labels[0] + train_labels[1])
    test_all_labels = np.array(test_labels[0] + test_labels[1])

    # Represent documents as bags of n-grams ("bongs") or as weighted embeddings.
    if args.embed:
        print('Create embeddings')
        weights = None
        if args.weights == 'sif':
            weights = features.sif_weights(train_all_docs_tok, 1E-3)
        if args.weights == 'snif':
            weights = features.sif_weights(train_all_docs_tok, 1E-3)
            weights = {f: 1 - w for f, w in weights.items()}
        w2v = vectors.vocab2vecs({word
                                  for doc in train_all_docs_tok + test_all_docs_tok
                                  for word in doc},
                                 vectorfile=args.embedding)
        train_all_vecs = vectors.docs2vecs(train_all_docs_tok, f2v=w2v, weights=weights)
        test_all_vecs = vectors.docs2vecs(test_all_docs_tok, f2v=w2v, weights=weights)
    else:
        print('Create bongs')
        n = args.n
        min_count = args.min_count
        train_ngrams = [sum((list(nltk.ngrams(doc, k)) for k in range(1, n + 1)), [])
                        for doc in train_all_docs_tok]
        test_ngrams = [sum((list(nltk.ngrams(doc, k)) for k in range(1, n + 1)), [])
                       for doc in test_all_docs_tok]
        vocabulary = features.feature_vocab(train_ngrams, min_count=min_count)
        train_all_vecs = features.docs2bofs(train_ngrams, vocabulary)
        test_all_vecs = features.docs2bofs(test_ngrams, vocabulary)

    # Optionally normalize the document vectors.
    if args.normalize:
        normalize(train_all_vecs, copy=False)
        normalize(test_all_vecs, copy=False)
    print('Dimension of representation: %d' % train_all_vecs.shape[1])

    # Fit the classifier on all responses.
    clf = LogitCV(Cs=[10**i for i in range(-2, 3)], fit_intercept=False, cv=2,
                  dual=np.less(*train_all_vecs.shape), solver='liblinear',
                  n_jobs=-1, random_state=0)
    clf.fit(train_all_vecs, train_all_labels)
    #print('\tTrain acc: ', clf.score(train_all_vecs, train_all_labels))
    #print('\tTest acc: ', clf.score(test_all_vecs, test_all_labels))

    # Get vectors for first and second responses.
    n_tr = int(train_all_vecs.shape[0] / 2)
    n_te = int(test_all_vecs.shape[0] / 2)
    train_vecs = {i: train_all_vecs[i * n_tr:(i + 1) * n_tr, :] for i in range(2)}
    test_vecs = {i: test_all_vecs[i * n_te:(i + 1) * n_te, :] for i in range(2)}

    # Final evaluation.
    # Evaluate on the original pairwise task: the response that scores higher
    # against the learned hyperplane is predicted to be the sarcastic one.
    hyperplane = clf.coef_[0, :]
    train_pred_labels = 2 * (train_vecs[0].dot(hyperplane) > train_vecs[1].dot(hyperplane)) - 1
    test_pred_labels = 2 * (test_vecs[0].dot(hyperplane) > test_vecs[1].dot(hyperplane)) - 1
    train_expect_labels = train_labels[0]
    test_expect_labels = test_labels[0]
    print('\tTrain acc: ', (train_pred_labels == train_expect_labels).sum() / train_pred_labels.shape[0])
    print('\tTest acc: ', (test_pred_labels == test_expect_labels).sum() / test_pred_labels.shape[0])

    joblib.dump(clf, 'model.pkl')
    joblib.dump(test_all_vecs, 'vecs.pkl')
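# Minimal self-contained sketch of the pairwise decision rule above, with
# hypothetical numbers: each pair is labeled by which response projects further
# along the learned hyperplane, i.e. sign(w . x0 - w . x1).
#
#   import numpy as np
#   w = np.array([1.0, -1.0])                  # stand-in for clf.coef_[0, :]
#   x0, x1 = np.array([2.0, 0.0]), np.array([0.0, 1.0])
#   pred = 2 * (x0.dot(w) > x1.dot(w)) - 1     # -> 1: first response predicted more sarcastic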