def check_terminals(
        code_file='/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code',
        annot_file='/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.anno'):
    """Cross-check parse-tree leaf terminals against their annotations.

    For every annotation line, unescape each leaf label of the corresponding
    parse tree and test whether it occurs verbatim in the utterance.  Missing
    terminals of length <= 15 are collected as ``unique_terminals``; longer
    missing ones as ``invalid_terminals``.  Terminals that DO appear in the
    utterance are printed for inspection.

    Args:
        code_file: path to the Django code corpus fed to ``parse_django``.
        annot_file: path to the natural-language annotation file, one
            utterance per line, aligned with the parse trees.

    Returns:
        ``(unique_terminals, invalid_terminals)`` — the two sets described
        above (previously discarded; returning them is backward-compatible).
    """
    from parse import parse_django, unescape

    grammar, parse_trees = parse_django(code_file)

    unique_terminals = set()
    invalid_terminals = set()

    # `with` guarantees the annotation file is closed (original leaked it).
    with open(annot_file) as annot:
        for i, line in enumerate(annot):
            parse_tree = parse_trees[i]
            utterance = line.strip()

            leaves = parse_tree.get_leaves()
            leave_tokens = [l.label for l in leaves if l.label]

            for leaf_token in leave_tokens:
                leaf_token = unescape(str(leaf_token))
                if leaf_token not in utterance:
                    # Heuristic cutoff: short missing terminals are plausible
                    # candidates, long ones are considered invalid.
                    if len(leaf_token) <= 15:
                        unique_terminals.add(leaf_token)
                    else:
                        invalid_terminals.add(leaf_token)
                else:
                    # Original guarded on isinstance(leaf_token, str), which is
                    # always true after the str() conversion above.
                    print(leaf_token)

    return unique_terminals, invalid_terminals
def check_terminals(
        code_file='/Users/carlosgemmell/Documents/projects/ReCode/data/all.code',
        annot_file='/Users/carlosgemmell/Documents/projects/ReCode/data/all.anno'):
    """Cross-check parse-tree leaf terminals against their annotations.

    For every annotation line, unescape each leaf label of the corresponding
    parse tree and test whether it occurs verbatim in the utterance.  Missing
    terminals of length <= 15 are collected as ``unique_terminals``; longer
    missing ones as ``invalid_terminals``.  Terminals that DO appear in the
    utterance are printed for inspection.

    Args:
        code_file: path to the code corpus fed to ``parse_django``.
        annot_file: path to the natural-language annotation file, one
            utterance per line, aligned with the parse trees.

    Returns:
        ``(unique_terminals, invalid_terminals)`` — the two sets described
        above (previously discarded; returning them is backward-compatible).
    """
    from parse import parse_django, unescape

    grammar, parse_trees = parse_django(code_file)

    unique_terminals = set()
    invalid_terminals = set()

    # `with` guarantees the annotation file is closed (original leaked it).
    with open(annot_file) as annot:
        for i, line in enumerate(annot):
            parse_tree = parse_trees[i]
            utterance = line.strip()

            leaves = parse_tree.get_leaves()
            leave_tokens = [l.label for l in leaves if l.label]

            for leaf_token in leave_tokens:
                leaf_token = unescape(str(leaf_token))
                if leaf_token not in utterance:
                    # Heuristic cutoff: short missing terminals are plausible
                    # candidates, long ones are considered invalid.
                    if len(leaf_token) <= 15:
                        unique_terminals.add(leaf_token)
                    else:
                        invalid_terminals.add(leaf_token)
                else:
                    # Original guarded on isinstance(leaf_token, str), which is
                    # always true after the str() conversion above.
                    print(leaf_token)

    return unique_terminals, invalid_terminals
def check_terminals(
        code_file='/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code',
        annot_file='/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.anno'):
    """Cross-check parse-tree leaf terminals against their annotations.

    For every annotation line, unescape each leaf label of the corresponding
    parse tree and test whether it occurs verbatim in the utterance.  Missing
    terminals of length <= 15 are collected as ``unique_terminals``; longer
    missing ones as ``invalid_terminals``.  Terminals that DO appear in the
    utterance are printed for inspection.

    Args:
        code_file: path to the Django code corpus fed to ``parse_django``.
        annot_file: path to the natural-language annotation file, one
            utterance per line, aligned with the parse trees.

    Returns:
        ``(unique_terminals, invalid_terminals)`` — the two sets described
        above (previously discarded; returning them is backward-compatible).
    """
    from parse import parse_django, unescape

    grammar, parse_trees = parse_django(code_file)

    unique_terminals = set()
    invalid_terminals = set()

    # `with` guarantees the annotation file is closed (original leaked it).
    with open(annot_file) as annot:
        for i, line in enumerate(annot):
            parse_tree = parse_trees[i]
            utterance = line.strip()

            leaves = parse_tree.get_leaves()
            leave_tokens = [l.label for l in leaves if l.label]

            for leaf_token in leave_tokens:
                leaf_token = unescape(str(leaf_token))
                if leaf_token not in utterance:
                    # Heuristic cutoff: short missing terminals are plausible
                    # candidates, long ones are considered invalid.
                    if len(leaf_token) <= 15:
                        unique_terminals.add(leaf_token)
                    else:
                        invalid_terminals.add(leaf_token)
                else:
                    # Original guarded on isinstance(leaf_token, str), which is
                    # always true after the str() conversion above.
                    print(leaf_token)

    return unique_terminals, invalid_terminals
def test_unescape(self):
    """Verify ``parse.unescape`` removes single backslash escapes.

    Cases cover: an escaped close-paren, a mix of escaped and plain
    parens, input with nothing to unescape, and a doubled backslash
    collapsing to a single one.
    """
    cases = [
        (r'(HASHES (1 2 3 4\))', '(HASHES (1 2 3 4))'),
        (r'(HASHES \(1 2 3 4\))', r'(HASHES \(1 2 3 4))'),
        ('TEST 123', 'TEST 123'),
        (r'\(HASHES 1\\)', r'\(HASHES 1\)'),
    ]
    for escaped, expected in cases:
        assert parse.unescape(escaped) == expected