def parse(sentence, use_cache=True, parser='stanford'):
    """Parse `sentence` into a list of ParentedTree objects.

    Results are memoized per-parser via cache_get/cache_set when
    `use_cache` is true.  An unknown `parser` name yields [].
    """
    cache_key = "parse_trees_{0}".format(parser)

    valid_lines = None
    if use_cache:
        cached = cache_get(cache_key, sentence)
        if cached:
            valid_lines = cached

    if valid_lines is None:
        if parser == "stanford":
            response = parse_stanford(sentence, use_cache=use_cache)
        elif parser == "malt":
            response = parse_malt(sentence, use_cache=use_cache)
        else:
            return []
        # Keep only lines that look like complete s-expressions and throw
        # away the rest of the parser's chatter.  This could hide errors,
        # but we got deadlines...
        valid_lines = [line for line in response.split("\n")
                       if len(line) > 2 and line[0] == "(" and line[-1] == ")"]
        if use_cache:
            cache_set(cache_key, sentence, valid_lines)

    return [ParentedTree.parse(line) for line in valid_lines]
def gen(files):
    """Yield one feature row per input file.

    Each file's entire contents is parsed as a single tree; the row is
    built from the local context of the tree's POS-tagged tokens.
    """
    for f in files:
        with open(f) as fi:
            # (token, tag) pairs for the whole parsed file
            pos = ParentedTree.parse(fi.read()).pos()
            yield makeRow(getLocalContext(pos), f)
def get_sentence_posteriors(sentence, iterations=1, extra_meaning=None): meaning_probs = {} # parse sentence with charniak and apply surgeries print 'parsing ...' modparse = get_modparse(sentence) t = ParentedTree.parse(modparse) print '\n%s\n' % t.pprint() num_ancestors = count_lmk_phrases(t) - 1 for _ in xrange(iterations): (lmk, _, _), (rel, _, _) = get_meaning(num_ancestors=num_ancestors) meaning = m2s(lmk,rel) if meaning not in meaning_probs: ps = get_tree_probs(t, lmk, rel)[0] # print "Tree probs: ", zip(ps,rls) meaning_probs[meaning] = np.prod(ps) print '.' if extra_meaning: meaning = m2s(*extra_meaning) if meaning not in meaning_probs: ps = get_tree_probs(t, lmk, rel)[0] # print "Tree prob: ", zip(ps,rls) meaning_probs[meaning] = np.prod(ps) print '.' summ = sum(meaning_probs.values()) for key in meaning_probs: meaning_probs[key] /= summ return meaning_probs.items()
def get_modparse(sentence): """returns the modified parse tree for a sentence""" sp_db = SentenceParse.get_sentence_parse(sentence) try: res = sp_db.all()[0] parsetree = res.original_parse modparsetree = res.modified_parse except: print "parse.py: 103: " + sentence parses = parse_sentences([sentence]) if len(parses) == 0: raise ParseError(printcolors.WARNING + ('ParseError: a sentence was empty')) modparses = modify_parses(parses) for i,chunk in enumerate(modparses[:]): for j,modparse in enumerate(chunk): if 'LANDMARK-PHRASE' in modparse: modparses[i] = modparse parses[i] = parses[i][j] break if isinstance(modparses[i],list): modparses[i] = modparses[i][0] parses[i] = parses[i][0] parsetree = parses[0] modparsetree = modparses[0] try: SentenceParse.add_sentence_parse(sentence, parsetree, modparsetree) except Exception as e: print e if count_lmk_phrases(ParentedTree.parse(modparsetree)) < 1: raise ParseError(printcolors.WARNING + ('ParseError: Parse contained no Landmark phrase.\nSentence: %s\nParse: %s\nModparse: %s' % (sentence,parsetree,modparsetree))) return parsetree, modparsetree
def get_sentence_meaning_likelihood(sentence, lmk, rel): modparse = get_modparse(sentence) t = ParentedTree.parse(modparse) print '\n%s\n' % t.pprint() probs, entropies, lrpc, tps = get_tree_probs(t, lmk, rel) if np.prod(probs) == 0.0: logger('ERROR: Probability product is 0 for sentence: %s, lmk: %s, rel: %s, probs: %s' % (sentence, lmk, rel, str(probs))) return np.prod(probs), sum(entropies), lrpc, tps
def get_all_sentence_posteriors(sentence, meanings, golden=False, printing=True):
    """Score every landmark and relation appearing in `meanings` against the
    parse of `sentence`, returning {lmk_or_rel: probability}.

    Landmarks are scored against subtree t[1], relations against t[0];
    a text spinner is written to stdout while processing.
    """
    print 'parsing ...'
    _, modparse = get_modparse(sentence)
    t = ParentedTree.parse(modparse)
    print '\n%s\n' % t.pprint()
    num_ancestors = count_lmk_phrases(t) - 1
    # deduplicate the landmarks and relations before scoring
    lmks, rels = zip(*meanings)
    lmks = set(lmks)
    rels = set(rels)
    # spinner animation frames
    syms = ['\\', '|', '/', '-']
    sys.stdout.write('processing...\\')
    sys.stdout.flush()
    posteriors = {}
    for i,lmk in enumerate(lmks):
        # a landmark with the wrong ancestor depth cannot match this parse
        if lmk.get_ancestor_count() != num_ancestors:
            p = 0
        else:
            ps = get_tree_probs(t[1], lmk, golden=golden, printing=printing)[0]
            p = np.prod(ps)
        posteriors[lmk] = p
        sys.stdout.write("\b%s" % syms[i % len(syms)])
        sys.stdout.flush()
    for i,rel in enumerate(rels):
        ps = get_tree_probs(t[0], rel=rel, golden=golden, printing=printing)[0]
        posteriors[rel] = np.prod(ps)
        sys.stdout.write("\b%s" % syms[i % len(syms)])
        sys.stdout.flush()
    # NOTE(review): placement of this trailing animation relative to the
    # loop above is ambiguous in the original formatting — assumed to run
    # once after both loops; verify against the original file.
    for j in range(50):
        sys.stdout.write("\b.%s" % syms[(i+j) % len(syms)])
        sys.stdout.flush()
    print
    return posteriors
def get_all_sentence_posteriors(sentence, meanings): print 'parsing ...' modparse = get_modparse(sentence) t = ParentedTree.parse(modparse) print '\n%s\n' % t.pprint() num_ancestors = count_lmk_phrases(t) - 1 posteriors = [] for meaning in meanings: lmk,rel = meaning if lmk.get_ancestor_count() != num_ancestors: p = 0 else: ps = get_tree_probs(t, lmk, rel)[0] p = np.prod(ps) posteriors.append(p) return posteriors
def get_sentence_posteriors(sentence, iterations=1): probs = [] meanings = [] # parse sentence with charniak and apply surgeries print 'parsing ...' modparse = get_modparse(sentence) t = ParentedTree.parse(modparse) print '\n%s\n' % t.pprint() num_ancestors = count_lmk_phrases(t) - 1 for _ in xrange(iterations): meaning = get_meaning(num_ancestors=num_ancestors) lmk, rel = meaning probs.append(get_tree_prob(t, *meaning)) meanings.append(m2s(lmk,rel)) print '.' probs = np.array(probs) / sum(probs) return uniquify_distribution(meanings, probs)
args = parser.parse_args()

# Each CSV row is (xloc, yloc, sentence, parse, modparse).
reader = csv.reader(args.csvfile, lineterminator="\n")
next(reader)  # skip headers

for i, row in enumerate(reader, start=1):
    print "sentence", i
    # unpack row
    xloc, yloc, sentence, parse, modparse = row
    # convert variables to the right types
    xloc = float(xloc)
    yloc = float(yloc)
    loc = (xloc, yloc)
    parse = ParentedTree.parse(parse)
    modparse = ParentedTree.parse(modparse)
    # how many ancestors should the sampled landmark have?
    num_ancestors = count_lmk_phrases(modparse) - 1
    # sample `args.iterations` times for each sentence
    for _ in xrange(args.iterations):
        lmk, rel = get_meaning(loc, num_ancestors)
        if args.verbose:
            print "utterance:", repr(sentence)
            print "location: %s" % repr(loc)
            print "landmark: %s (%s)" % (lmk, lmk_id(lmk))
            print "relation: %s" % rel_type(rel)
            print "parse:"
def __init__(self, parse_tree):
    """Wrap a raw parse-tree string; self.tree is None when unparseable."""
    try:
        self.tree = ParentedTree.parse(parse_tree)
    except Exception:
        # FIX: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any parse failure leaves tree as None.
        self.tree = None
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('sentence')
    args = parser.parse_args()

    scene, speaker = construct_training_scene()

    print 'parsing ...'
    modparse = get_modparse(args.sentence)
    # NOTE(review): get_modparse returns a (parse, modparse) pair elsewhere
    # in the project — confirm ParentedTree.parse receives a string here.
    t = ParentedTree.parse(modparse)
    print '\n%s\n' % t.pprint()
    print_tree_entropy(t)
    raw_input()  # pause so the tree/entropy output can be inspected

    parts = build_meaning(t)
    for part in parts:
        print "Suggested for", part[0]
        # highest-scoring suggestions first
        items = sorted(part[1].items(), key=lambda x: x[1], reverse=True)
        for item in items:
            print ' ', rjust(item[0], 40), item[1]
        print
def stanford_tree_reader(nlp):
    """Convert a CoreNLP-style result dict into ParentedTree objects,
    one per entry in nlp["sentences"]."""
    return [ParentedTree.parse(sent["parsetree"]) for sent in nlp["sentences"]]
next(reader) # skip headers unique_sentences = {} for i,row in enumerate(reader, start=1): print 'sentence', i # unpack row xloc, yloc, sentence, parse, modparse = row unique_sentences[sentence] = (parse, modparse) # convert variables to the right types xloc = float(xloc) yloc = float(yloc) loc = (xloc, yloc) parse = ParentedTree.parse(parse) modparse = ParentedTree.parse(modparse) # how many ancestors should the sampled landmark have? num_ancestors = count_lmk_phrases(modparse) - 1 if num_ancestors == -1: print 'Failed to parse %d [%s] [%s] [%s]' % (i, sentence, parse, modparse) continue # sample `args.iterations` times for each sentence for _ in xrange(args.iterations): lmk, rel = get_meaning(loc, num_ancestors) lmk, _, _ = lmk rel, _, _ = rel
# NOTE(review): this fragment contains `continue`, so it is the body of an
# enclosing loop whose header is outside this view; `i` is also defined by
# that loop.
utils.scene.set_scene(scene, speaker)

table = scene.landmarks['table'].representation.rect
t_min = table.min_point
t_max = table.max_point
t_w = table.width
t_h = table.height

# sample a uniformly random point on the table as the trajector
xloc, yloc = random() * t_w + t_min.x, random() * t_h + t_min.y
trajector = Landmark('point', PointRepresentation(Vec2(xloc, yloc)), None, Landmark.POINT)
sentence, rel, lmk = speaker.describe(trajector, scene, False, 1)
parsestring, modparsestring = get_modparse(sentence)
unique_sentences[sentence] = (parsestring, modparsestring)
# convert variables to the right types
loc = (xloc, yloc)
parse = ParentedTree.parse(parsestring)
modparse = ParentedTree.parse(modparsestring)
# how many ancestors should the sampled landmark have?
num_ancestors = count_lmk_phrases(modparse) - 1
if num_ancestors == -1:
    # no landmark phrase found; skip this sample
    print 'Failed to parse %d [%s] [%s] [%s]' % (i, sentence, parse, modparse)
    continue
assert(not isinstance(lmk, tuple))
assert(not isinstance(rel, tuple))
if args.verbose:
    print 'utterance:', repr(sentence)
    print 'location: %s' % repr(loc)
def train(meaning, sentence, update=1, printing=False):
    """Run one recursive training update for the (lmk, rel) pair in
    `meaning` against the modified parse of `sentence`."""
    landmark, relation = meaning
    _, modparse = get_modparse(sentence)
    tree = ParentedTree.parse(modparse)
    train_rec(tree=tree, lmk=landmark, rel=relation,
              update=update, printing=printing)