Example #1
def _get_parse_from_strings(feature_table, constraint_set_string, words):
    lexicon = Lexicon(words, feature_table)

    constraint_set = ConstraintSet.load_from_printed_string_representation(constraint_set_string, feature_table)

    grammar = Grammar(feature_table, constraint_set, lexicon)
    return grammar.get_all_outputs_grammar()
Example #2
class TestingParserSuite(unittest.TestCase):
    """ this test case is designed test the parser and related function: get_range and generate
    see: https://taucompling.atlassian.net/wiki/display/OTML/Testing+parser+suite
    """

    def setUp(self):
        self.feature_table = FeatureTable.load(get_feature_table_fixture("a_b_and_son_feature_table.json"))
        self.constraint_set = ConstraintSet.load(get_constraint_set_fixture("no_bb_Max_Dep_constraint_set.json"),
                                                  self.feature_table)
        self.corpus = Corpus.load(get_corpus_fixture("testing_parser_suite_corpus.txt"))
        self.lexicon = Lexicon(self.corpus.get_words(), self.feature_table)
        self.grammar = Grammar(self.feature_table, self.constraint_set, self.lexicon)
        self.bb = Word("bb", self.feature_table)
        self.bab = Word("bab", self.feature_table)
        self.abba = Word("abba", self.feature_table)
        self.ababa = Word("ababa", self.feature_table)


    def test_generate(self):
        self.assertEqual(self.grammar.generate(self.bb), {"bab"})
        self.assertEqual(self.grammar.generate(self.bab), {"bab"})

        self.assertEqual(self.grammar.generate(self.abba), {"ababa"})
        self.assertEqual(self.grammar.generate(self.ababa),  {"ababa"})


    def test_parser(self):
        traversable_hypothesis = TraversableGrammarHypothesis(self.grammar, ["bb"])
        self.assertEqual(traversable_hypothesis.parse_data(), {'bb': set()})

        traversable_hypothesis = TraversableGrammarHypothesis(self.grammar, ["ababa"])
        self.assertEqual(traversable_hypothesis.parse_data(), {"ababa": set([(self.abba, 1), (self.ababa, 1)])})
Example #3
def factorize(G: Grammar):
    def lcp(sym):
        prefix_len, prefix = 0, None
        d = set()
        t = Trie([x for x in (G.terminals + G.nonTerminals)])
        for production in sym.productions:
            _, body = production
            p = t.prefix_query(body)
            if p:
                if len(p) > prefix_len:
                    prefix = p
                    prefix_len = len(p)
            t.insert(body)
        if prefix:
            for production in sym.productions:
                if all(prefix[i] == production.Right[i]
                       for i in range(prefix_len)):
                    d.add(production)
        print(d)
        return prefix, d

    stack = [x for x in G.nonTerminals]
    while stack:
        sym = stack.pop()
        prefix, productions = lcp(sym)
        change = prefix is not None
        if change:
            new_t = G.NonTerminal(f'{sym.Name}^')
            sym %= prefix + new_t
            for p in productions:
                remainder = p.Right[len(prefix)::]
                remainder = Sentence(*remainder) if remainder else G.Epsilon
                new_t %= remainder
                G.Productions.remove(p)
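A minimal usage sketch for the left-factoring routine above, assuming the same Grammar and Sentence API used throughout these examples (the NonTerminal/Terminals factories and the %= production operator); the toy grammar S -> a b | a c shares the prefix a, so after factorize it should read roughly S -> a S^ with S^ -> b | c.

G = Grammar()
S = G.NonTerminal('S', True)
a, b, c = G.Terminals('a b c')
S %= a + b   # the two bodies share the common prefix "a"
S %= a + c
factorize(G)
for production in G.Productions:
    print(production)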
Example #4
def remove_left_recursion(G: Grammar):
    def find_recursive_production(t: NonTerminal):
        recursives, non_recursives = list(), list()
        for production in t.productions:
            head, body = production
            if len(body) > 1 and head == body[0]:
                recursives.append(production)
            else:
                non_recursives.append(production)
        return recursives, non_recursives

    def remove_redundant_productions(G: Grammar):
        for production in G.Productions:
            while G.Productions.count(production) > 1:
                G.Productions.remove(production)

    for nt in G.nonTerminals:
        recursives, non_recursives = find_recursive_production(nt)
        if recursives:
            new_symbol = G.NonTerminal(f'{nt}^')
            for production in non_recursives:
                G.Productions.remove(production)
                body = production.Right
                nt %= body + new_symbol
            for production in recursives:
                alpha = Sentence(*production.Right[1::])
                G.Productions.remove(production)
                new_symbol %= alpha + new_symbol | G.Epsilon

    remove_redundant_productions(G)
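A similar hedged sketch for the left-recursion removal above, under the same API assumptions: E -> E + n | n is immediately left-recursive and should come out as E -> n E^ with E^ -> + n E^ | epsilon.

G = Grammar()
E = G.NonTerminal('E', True)
plus, n = G.Terminals('+ n')
E %= E + plus + n   # immediately left-recursive alternative
E %= n
remove_left_recursion(G)
for production in G.Productions:
    print(production)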
Example #5
    def clear_modules_caching(self):

        if True:
            Grammar.clear_caching()
            ConstraintSet.clear_caching()
            Constraint.clear_caching()
            Word.clear_caching()

        diagnostics_flag = False
        if diagnostics_flag:
            def object_size_in_mb(object_):
                from pympler.asizeof import asizeof

                return int((asizeof(object_) / (1024 ** 2)))

            import grammar.grammar
            import grammar.constraint_set
            import grammar.constraint
            import grammar.lexicon

            outputs_by_constraint_set_and_word_size = object_size_in_mb(
                grammar.grammar.outputs_by_constraint_set_and_word)
            grammar_transducers_size = object_size_in_mb(grammar.grammar.grammar_transducers)
            constraint_set_transducers_size = object_size_in_mb(grammar.constraint_set.constraint_set_transducers)
            constraint_transducers_size = object_size_in_mb(grammar.constraint.constraint_transducers)
            word_transducers_size = object_size_in_mb(grammar.lexicon.word_transducers)

            logger.info(
                "asizeof outputs_by_constraint_set_and_word: {} MB".format(outputs_by_constraint_set_and_word_size))
            logger.info("length outputs_by_constraint_set_and_word: {}".format(
                len(grammar.grammar.outputs_by_constraint_set_and_word)))

            logger.info("asizeof grammar_transducers: {} MB".format(grammar_transducers_size))
            logger.info("length grammar_transducers: {}".format(len(grammar.grammar.grammar_transducers)))
            logger.info("asizeof constraint_set_transducers: {} MB".format(constraint_set_transducers_size))
            logger.info("asizeof constraint_transducers: {} MB".format(constraint_transducers_size))

            logger.info("asizeof word_transducers: {} MB".format(word_transducers_size))
            logger.info("length word_transducers: {}".format(len(grammar.lexicon.word_transducers)))

            sum_asizeof = outputs_by_constraint_set_and_word_size + grammar_transducers_size + \
                          constraint_set_transducers_size + constraint_transducers_size + \
                          word_transducers_size

            logger.info("sum asizeof: {} MB".format(sum_asizeof))

            logger.info("Memory usage: {} MB".format(self._get_memory_usage()))
Example #6
class TestObjectCaching(unittest.TestCase):

    def setUp(self):
        self.feature_table = FeatureTable.load(get_feature_table_fixture("a_b_and_son_feature_table.json"))
        self.constraint_set_filename = get_constraint_set_fixture("no_bb_Max_Dep_constraint_set.json")
        self.corpus = Corpus.load(get_corpus_fixture("small_ab_corpus.txt"))
        self.word = Word("abababa", self.feature_table)
        self.constraint = PhonotacticConstraint([{'son': '+'}, {'son': '+'}], self.feature_table)
        self.constraint_set = ConstraintSet.load(self.constraint_set_filename, self.feature_table)
        self.lexicon = Lexicon(self.corpus.get_words(), self.feature_table)
        self.grammar = Grammar(self.feature_table, self.constraint_set, self.lexicon)

    def test_constraint_transducer_caching(self):
        max_constraint = MaxConstraint([{'son': '+'}], self.feature_table)
        #deepcopy(max_constraint)
        orig_transducer = max_constraint.get_transducer()
        max_constraint.augment_feature_bundle()
        new_transducer = max_constraint.get_transducer()
        self.assertEqual(id(orig_transducer), id(new_transducer))

    #def test_word_caching(self):
    #    get_transducer(self.word)
    #    get_transducer(self.word)
    #    get_transducer(self.word)
    #
    #def test_constraint_caching(self):
    #    get_transducer(self.constraint)
    #    get_transducer(self.constraint)
    #    get_transducer(self.constraint)
    #
    #
    #def test_constraint_set_caching(self):
    #    get_transducer(self.constraint_set)
    #    get_transducer(self.constraint_set)
    #    get_transducer(self.constraint_set)
    #
    #def test_grammar_caching(self):
    #    get_transducer(self.grammar)
    #    get_transducer(self.grammar)
    #    get_transducer(self.grammar)

    def test_generate_caching(self):
        word = Word("bbb", self.feature_table)
        word_outputs = self.grammar.generate(word)
        from grammar.grammar import outputs_by_constraint_set_and_word

        constraint_set_and_word_key = str(self.grammar.constraint_set) + str(word)

        self.assertEqual(set(outputs_by_constraint_set_and_word[constraint_set_and_word_key]),
                         set(word_outputs))

    def test_parse_data_caching(self):
        pass


    def test_start_from_middle(self):
        print()
        print(pickle.dumps(configurations))
Example #7
def train(args):
    train_set = Dataset.from_bin_file(args.train_file)
    if args.dev_file:
        dev_set = Dataset.from_bin_file(args.dev_file)
    else:
        dev_set = Dataset(examples=[])
    
    vocab = pickle.load(open(args.vocab, 'rb'))
    grammar = Grammar.from_text(open(args.asdl_file).read())
    # transition_system = Registrable.by_name(args.transition_system)(grammar)
    transition_system = TurkTransitionSystem(grammar)
    
    parser = ASNParser(args, transition_system, vocab)    
    nn_utils.glorot_init(parser.parameters())

    optimizer = optim.Adam(parser.parameters(), lr=args.lr)
    best_acc = 0.0
    log_every = args.log_every
    
    train_begin = time.time()
    for epoch in range(1, args.max_epoch + 1):
        train_iter = 0
        loss_val = 0.
        epoch_loss = 0.

        parser.train()

        epoch_begin = time.time()
        for batch_example in train_set.batch_iter(batch_size=args.batch_size, shuffle=False):
            optimizer.zero_grad()
            loss = parser.score(batch_example)
            loss_val += torch.sum(loss).data.item()
            epoch_loss += torch.sum(loss).data.item()
            loss = torch.mean(loss)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(parser.parameters(), args.clip_grad)

            optimizer.step()
            train_iter += 1
            if train_iter % log_every == 0:
                print("[epoch {}, step {}] loss: {:.3f}".format(epoch, train_iter, loss_val / (log_every * args.batch_size )))
                loss_val = 0.

        # print(epoch, 'Train loss', '{:.3f}'.format(epoch_loss / len(train_set)), 'time elapsed %d' % (time.time() - epoch_begin))
        print('[epoch {}] train loss {:.3f}, epoch time {:.0f}, total time {:.0f}'.format(epoch, epoch_loss / len(train_set), time.time() - epoch_begin, time.time() - train_begin) )
        if epoch > args.run_val_after:
            eval_begin = time.time()
            parser.eval()
            with torch.no_grad():
                parse_results = [parser.naive_parse(ex) for ex in dev_set]
            match_results = [transition_system.compare_ast(e.tgt_ast, r) for e, r in zip(dev_set, parse_results)]
            match_acc = sum(match_results) * 1. / len(match_results)
            # print('Eval Acc', match_acc)
            print('[epoch {}] eval acc {:.3f}, eval time {:.0f}'.format(epoch, match_acc, time.time() - eval_begin))
            
            if match_acc >= best_acc:
                best_acc = match_acc
                parser.save(args.save_to)
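For orientation, a hedged sketch of the argument namespace that train() reads; the attribute names are exactly those accessed above, while the values and paths (matching the dump paths in the make_dataset example further down) are illustrative only.

from argparse import Namespace

args = Namespace(
    train_file='data/turk/train.bin',
    dev_file='data/turk/dev.bin',
    vocab='data/turk/vocab.bin',
    asdl_file='data/turk/turk_asdl.txt',
    lr=1e-3,            # illustrative hyperparameters
    batch_size=32,
    max_epoch=30,
    log_every=50,
    clip_grad=5.0,
    run_val_after=5,
    save_to='model.bin',
)
# train(args)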
Example #8
 def setUp(self):
     self.feature_table = FeatureTable.load(get_feature_table_fixture("a_b_and_son_feature_table.json"))
     self.constraint_set_filename = get_constraint_set_fixture("no_bb_Max_Dep_constraint_set.json")
     self.corpus = Corpus.load(get_corpus_fixture("small_ab_corpus.txt"))
     self.word = Word("abababa", self.feature_table)
     self.constraint = PhonotacticConstraint([{'son': '+'}, {'son': '+'}], self.feature_table)
     self.constraint_set = ConstraintSet.load(self.constraint_set_filename, self.feature_table)
     self.lexicon = Lexicon(self.corpus.get_words(), self.feature_table)
     self.grammar = Grammar(self.feature_table, self.constraint_set, self.lexicon)
Example #9
def save_words(gram: Grammar, output_path: str):
    """
    Writes all words of a language, described by the grammar `gram`.

    :param gram: the grammar of a language
    :param output_path: path to the output file
    """

    words = gram.generate_words()

    with open(output_path, 'w') as output:
        print('Words in language of {grammar_name}'.format(
            grammar_name=repr(gram.get_name())),
              file=output)
        print('It has {words_count} words:\n'.format(words_count=len(words)),
              file=output)

        for word in sorted(words):
            print(word if word != '' else '#eps', file=output)
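A minimal usage sketch, assuming the from_xml loader shown in a later example and an existing grammar description file; both file names are illustrative.

gram = from_xml('grammar.xml')      # load a Grammar from its XML description
save_words(gram, 'words.txt')       # write the generated words to disk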
Example #10
 def __init__(self,
              segmentLength=20,
              paaSize=5,
              alphabetSize=3,
              upperBound=100,
              lowerBound=-100):
     self.segmentLength = segmentLength
     self.paaSize = paaSize
     self.alphabetSize = alphabetSize
     self.upperBound = upperBound
     self.lowerBound = lowerBound
     self.sax = SAX(wordSize=paaSize,
                    alphabetSize=alphabetSize,
                    lowerBound=lowerBound,
                    upperBound=upperBound,
                    epsilon=1e-6)
     self.grammar = Grammar()
     self.segmentIndexes = []
     self.rule_set = []
     self.tsCount = 0
Example #11
 def setUp(self):
     self.feature_table = FeatureTable.load(get_feature_table_fixture("a_b_and_son_feature_table.json"))
     self.constraint_set = ConstraintSet.load(get_constraint_set_fixture("no_bb_Max_Dep_constraint_set.json"),
                                               self.feature_table)
     self.corpus = Corpus.load(get_corpus_fixture("testing_parser_suite_corpus.txt"))
     self.lexicon = Lexicon(self.corpus.get_words(), self.feature_table)
     self.grammar = Grammar(self.feature_table, self.constraint_set, self.lexicon)
     self.bb = Word("bb", self.feature_table)
     self.bab = Word("bab", self.feature_table)
     self.abba = Word("abba", self.feature_table)
     self.ababa = Word("ababa", self.feature_table)
Example #12
def regex_to_grammar(regex: Regex):
    automaton = regex.automaton
    G = Grammar()
    start = G.NonTerminal(f'A{automaton.start}', True)
    n = {}
    n[automaton.start] = start
    for i in range(automaton.states):
        if i != automaton.start:
            n[i] = G.NonTerminal(f'A{i}')
    t = {}
    for sym in automaton.vocabulary:
        t[sym] = G.Terminal(sym)

    for src, d in automaton.transitions.items():
        for sym, dest in d.items():
            nt = n[src]
            nt1 = n[dest[0]]
            s = t[sym]
            nt %= s + nt1 if nt1 != nt else s
    for f in automaton.finals:
        nt = n[f]
        nt %= G.Epsilon
    return G
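A hedged usage sketch, assuming the Regex class shown in a later example (its automaton attribute holds the DFA this conversion walks): each DFA state i becomes the non-terminal A{i}, each transition a right-linear production, and final states additionally derive epsilon.

r = Regex('(a|b)*abb')
G = regex_to_grammar(r)
for production in G.Productions:
    print(production)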
Example #13
    def __init__(self, regex, ignore_white_space=True):
        self.regex = regex
        self.G = Grammar()
        E = self.G.NonTerminal('E', True)
        T, F, A, X, Y, Z = self.G.NonTerminals('T F A X Y Z')
        pipe, star, opar, cpar, symbol, epsilon, plus, minus, obrack, cbrack, question = self.G.Terminals(
            '| * ( ) symbol ε + - [ ] ?')

        E %= T + X, lambda h, s: s[2], None, lambda h, s: s[1]
        X %= pipe + E, lambda h, s: UnionNode(h[0], s[2])
        X %= self.G.Epsilon, lambda h, s: h[0]
        T %= F + Y, lambda h, s: s[2], None, lambda h, s: s[1]
        Y %= T, lambda h, s: ConcatNode(h[0], s[1])
        Y %= self.G.Epsilon, lambda h, s: h[0]
        F %= A + Z, lambda h, s: s[2], None, lambda h, s: s[1]
        Z %= star, lambda h, s: ClosureNode(h[0])
        Z %= plus, lambda h, s: PositiveClosureNode(h[0])
        Z %= question, lambda h, s: QuestionNode(h[0])
        Z %= self.G.Epsilon, lambda h, s: h[0]
        A %= symbol, lambda h, s: SymbolNode(s[1])
        A %= epsilon, lambda h, s: EpsilonNode(s[1])
        A %= opar + E + cpar, lambda h, s: s[2]

        self.automaton = self._build_automaton(regex, ignore_white_space)
Example #14
def make_dataset():

    grammar = Grammar.from_text(open('data/turk/turk_asdl.txt').read())
    transition_system = TurkTransitionSystem(grammar)

    train_set = load_dataset("train", transition_system)
    dev_set = load_dataset("val", transition_system)
    test_set = load_dataset("test", transition_system)
    # get vocab from actions
    vocab = build_dataset_vocab(train_set, transition_system, src_cutoff=2)

    # cache decision using vocab can be done in train
    pickle.dump(train_set, open('data/turk/train.bin', 'wb'))
    pickle.dump(dev_set, open('data/turk/dev.bin', 'wb'))
    pickle.dump(test_set, open('data/turk/test.bin', 'wb'))
    pickle.dump(vocab, open('data/turk/vocab.bin', 'wb'))
Example #15
	def __init__(self, filename):
		'''
		Initialise the grammar to be used to parse each line in the log file
		Setup multiprocessing shared object and other generic structs
		'''
		#Function dictionary, something to mimic function pointers
		self.parse_line = {
			'octeon':self.parse_line_octeon,
			'others':self.parse_line_other
		}

		self.log = Logger("parser","info") #???? Not the best implementation yet
		#Initialise variables
		self.__filepath = filename
		#Setup the grammar to use based on the filetype
		self.__grammar_type = self.get_grammar_type(filename)
		self.__current_grammar = Grammar(self.__grammar_type).grammar

		self.parse_file()
Example #16
def from_xml(xml_path) -> Grammar:
    """
    Parses xml file describing the language grammar.
    Creates a Grammar object and returns it.

    :param xml_path: source xml-file
    :return: Grammar object
    """

    terminals = {''}
    non_terminals = set()
    starting_non_terminal = None
    transitions = {}
    restrictions = {}

    restrictions_names = {'max-word-length'}

    def _parse_terminals(terminals_xml_element):
        for terminal in terminals_xml_element:
            terminals.add(terminal.attrib['value'])

    def _parse_non_terminals(non_terminals_xml_element):
        nonlocal starting_non_terminal

        for non_terminal in non_terminals_xml_element:
            if 'starting' in non_terminal.attrib and (
                    non_terminal.attrib['starting'] == 'true'):

                starting_non_terminal = non_terminal.attrib['value']

            non_terminals.add(non_terminal.attrib['value'])

    def _parse_transitions(transitions_xml_element):
        for transition in transitions_xml_element:
            from_sym = transition.attrib['from']
            to_seq = transition.attrib['to']

            if from_sym not in transitions:
                transitions[from_sym] = []

            transitions[from_sym].append(to_seq)

    def _parse_restrictions(restrictions_xml_element):
        for restriction in restrictions_xml_element:
            if restriction.tag in restrictions_names:
                restrictions[restriction.tag] = restriction.attrib['value']

    tree = ElementTree.parse(xml_path)
    root = tree.getroot()

    actions = {
        'terminals': _parse_terminals,
        'non-terminals': _parse_non_terminals,
        'transitions': _parse_transitions,
        'restrictions': _parse_restrictions,
    }

    for child in root:
        parse_action = actions.get(child.tag, None)

        if parse_action is not None:
            parse_action(child)

    if 'name' in root.attrib and root.attrib['name'] != '':
        grammar_name = root.attrib['name']
    else:
        grammar_name = 'unknown'

    return Grammar(terminals=terminals,
                   non_terminals=non_terminals,
                   starting_non_terminal=starting_non_terminal,
                   transitions=transitions,
                   restrictions=restrictions,
                   name=grammar_name)
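The expected input structure can be read off the helper parsers above; a small illustrative grammar file might look like the string below (all symbols and the language itself are made up, only the tags and attributes actually read by from_xml matter).

# Illustrative input for from_xml; tags: terminals, non-terminals, transitions,
# restrictions; attributes: value, starting, from, to, name, max-word-length.
example_grammar_xml = """
<grammar name="ab-language">
    <terminals>
        <terminal value="a"/>
        <terminal value="b"/>
    </terminals>
    <non-terminals>
        <non-terminal value="S" starting="true"/>
    </non-terminals>
    <transitions>
        <transition from="S" to="aSb"/>
        <transition from="S" to=""/>
    </transitions>
    <restrictions>
        <max-word-length value="10"/>
    </restrictions>
</grammar>
"""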
Example #17
class Regex(object):
    def __init__(self, regex, ignore_white_space=True):
        self.regex = regex
        self.G = Grammar()
        E = self.G.NonTerminal('E', True)
        T, F, A, X, Y, Z = self.G.NonTerminals('T F A X Y Z')
        pipe, star, opar, cpar, symbol, epsilon, plus, minus, obrack, cbrack, question = self.G.Terminals(
            '| * ( ) symbol ε + - [ ] ?')

        E %= T + X, lambda h, s: s[2], None, lambda h, s: s[1]
        X %= pipe + E, lambda h, s: UnionNode(h[0], s[2])
        X %= self.G.Epsilon, lambda h, s: h[0]
        T %= F + Y, lambda h, s: s[2], None, lambda h, s: s[1]
        Y %= T, lambda h, s: ConcatNode(h[0], s[1])
        Y %= self.G.Epsilon, lambda h, s: h[0]
        F %= A + Z, lambda h, s: s[2], None, lambda h, s: s[1]
        Z %= star, lambda h, s: ClosureNode(h[0])
        Z %= plus, lambda h, s: PositiveClosureNode(h[0])
        Z %= question, lambda h, s: QuestionNode(h[0])
        Z %= self.G.Epsilon, lambda h, s: h[0]
        A %= symbol, lambda h, s: SymbolNode(s[1])
        A %= epsilon, lambda h, s: EpsilonNode(s[1])
        A %= opar + E + cpar, lambda h, s: s[2]

        self.automaton = self._build_automaton(regex, ignore_white_space)

    def _build_automaton(self, regex, ignore_white_space):
        def regex_tokenizer(regex, ignore_white_space):
            d = {term.Name: term for term in self.G.terminals}
            tokens = []
            symbol_term = [
                term for term in self.G.terminals if term.Name == 'symbol'
            ][0]
            fixed_tokens = {
                tok.Name: Token(tok.Name, tok)
                for tok in [
                    d['|'], d['*'], d['+'], d['?'], d['('], d[')'], d['['],
                    d[']'], d['-'], d['ε']
                ]
            }

            for i, c in enumerate(regex):
                if c == '@' or (ignore_white_space and c.isspace()):
                    continue
                try:
                    token = fixed_tokens[c]
                    if regex[i - 1] == '@':
                        raise KeyError
                except KeyError:
                    token = Token(c, symbol_term)
                tokens.append(token)
            tokens.append(Token('$', self.G.EOF))
            return tokens

        toks = regex_tokenizer(regex, ignore_white_space)
        parser = build_ll1_parser(self.G)
        left_parse = parser(toks)
        tree = evaluate_parse(left_parse, toks)
        automatom = tree.evaluate()
        automaton = nfa_to_deterministic(automatom)
        return automaton

    def __call__(self, w: str):
        return self.automaton.recognize(w)
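A short, hedged usage sketch for the class above: the constructor compiles the pattern into a DFA and __call__ runs recognition (expected results noted as comments).

r = Regex('(a|b)*abb')
print(r('aabb'))   # expected: True
print(r('abab'))   # expected: False (does not end in abb)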
Example #18
class SymbolicClustering(object):
    def __init__(self,
                 segmentLength=20,
                 paaSize=5,
                 alphabetSize=3,
                 upperBound=100,
                 lowerBound=-100):
        self.segmentLength = segmentLength
        self.paaSize = paaSize
        self.alphabetSize = alphabetSize
        self.upperBound = upperBound
        self.lowerBound = lowerBound
        self.sax = SAX(wordSize=paaSize,
                       alphabetSize=alphabetSize,
                       lowerBound=lowerBound,
                       upperBound=upperBound,
                       epsilon=1e-6)
        self.grammar = Grammar()
        self.segmentIndexes = []
        self.rule_set = []
        self.tsCount = 0

    def printSegments(self):
        print("\nCurrent Segments:")
        for segmentIndex in self.segmentIndexes:
            segmentIndex.printContent()

    def discretize(self, s):
        """
        @description  : discretize a single series using the modified PAA method.
        ---------
        @param  : s -- timeseries in array format
        -------
        @Returns  : a list of segments which are discretized from the input.
        -------
        """

        n = len(s)
        segments = []
        if n % self.segmentLength != 0:
            raise SegmentsCanNotBeEquallyDivided()
        nSegment = int(n / self.segmentLength)
        for i in range(0, nSegment):
            start = i * self.segmentLength
            end = (i + 1) * self.segmentLength
            if self.tsCount == 0:
                self.segmentIndexes.append(SegmentIndex((start, end)))
            (letters, indices) = self.sax.to_letter_rep_ori(s[start:end])
            segment = Segment(s[start:end], letters, indices,
                              self.segmentIndexes[i])
            self.segmentIndexes[i].addSegment(segment)
            segments.append(segment)
        self.tsCount += 1
        return segments

    def grammar_induction(self, segments):
        """
        @description  : get grammar from segments.
        ---------
        @param  : segments -- a list of segments
        -------
        @Returns  :
        -------
        """
        self.grammar.train_string(segments)
        self.rule_set = self.grammar.get_rule_set()

    def get_frequency_matrix(self):
        """
        @description  : for each segment, count how many segments at the same position are covered by the same grammar rule.
        ---------
        @param  :
        -------
        @Returns  : a two-dimensional matrix giving, for each segment, how many segments at the same position share its grammar rule.

        -------
        """

        frequencyMatrix = []
        for segmentIndex in self.segmentIndexes:
            rDict = {}
            for j in range(0, self.tsCount):
                segment = segmentIndex.getSegment(j)
                rule = segment.getRule()
                if rDict.get(rule) is None:
                    rDict[rule] = 1
                else:
                    rDict[rule] = rDict[rule] + 1

            rowFrequency = []
            for j in range(0, self.tsCount):
                rule = segmentIndex.getSegment(j).getRule()
                if rule == self.grammar.root_production:
                    rowFrequency.append(1)
                else:
                    rowFrequency.append(
                        rDict[segmentIndex.getSegment(j).getRule()])

            frequencyMatrix.append(rowFrequency)

        return frequencyMatrix

    def cut_window(self, frequencyMatrix):
        """
        @description  : generate windows with the frequencyMatrix. The change points of the frequency are the cut lines.
        ---------
        @param  : frequencyMatrix -- a two-dimensional matrix
        -------
        @Returns  : a list of windows
        -------
        """

        start = 0
        windows = []
        for now in range(1, len(frequencyMatrix)):
            if frequencyMatrix[now] != frequencyMatrix[start]:
                windows.append(Window(start, now, self.segmentLength))
                start = now
        windows.append(Window(start, len(frequencyMatrix), self.segmentLength))
        return windows

    def generateInitialClusters(self, startIndex, windows):
        """
        @description  : generate initial clusters in each window. The clusters do not
        overlap each other, but together they cover all the segments.
        ---------
        @param  : startIndex -- the start number of p time series.
                  windows  --  the cut window used to generate clusters
        -------
        @Returns  : new windows that each of which contains the generated clusters
        -------
        """

        for window in windows:
            window.initSubsequences(startIndex, self)
            window.initClusters(self)
            window.clustersCombination()
            window.clustersBreakingTie()
            window.clustersProcessMiss()
            window.computeAllDistancesAndCentroids()
        return windows
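A hedged end-to-end sketch of the pipeline implemented above; the SAX and Grammar helpers are assumed to come from the same project, and the input length must be a multiple of segmentLength.

import numpy as np

sc = SymbolicClustering(segmentLength=20, paaSize=5, alphabetSize=3)
series = np.sin(np.linspace(0, 2 * np.pi, 100))   # 100 points -> 5 segments of 20
segments = sc.discretize(series)                  # PAA/SAX discretization
sc.grammar_induction(segments)                    # learn grammar rules over the segments
frequency_matrix = sc.get_frequency_matrix()
windows = sc.cut_window(frequency_matrix)         # cut at frequency change points
windows = sc.generateInitialClusters(0, windows)  # initial clusters per window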
Example #19
def build_cool_grammar():
    G = Grammar()
    program = G.NonTerminal('<program>', True)
    class_list, class_def, empty_feature_list, feature_list, meod_def = G.NonTerminals(
        '<class_list> <class_def> <empty_feature_list> <feature_list> <meod_def>'
    )
    attr_def, param_list, param, statement_list = G.NonTerminals(
        '<attr_def> <param_list> <param> <statement_list>')
    statement, var_dec, func_call, args_list = G.NonTerminals(
        '<statement> <var_dec> <func_call> <args_list>')
    exp, typex, term, factor = G.NonTerminals('<exp> <type> <term> <factor>')
    arith, atom = G.NonTerminals('<arith> <atom>')
    args_list_empty, param_list_empty = G.NonTerminals(
        '<args_list_empty> <param_list_empty>')

    class_keyword, def_keyword, in_keyword = G.Terminals('class def in')
    coma, period, dot_comma, opar, cpar, obrack, cbrack, plus, minus, star, div, dd = G.Terminals(
        ', . ; ( ) { } + - * / :')
    idx, let, intx, string, num, equal, true, false, boolean, objectx = G.Terminals(
        'id let int string num = true false bool object')
    string_const, void, auto = G.Terminals('string_const void AUTO_TYPE')
    if_, then, else_, assign, new = G.Terminals('if then else assign new')
    gt, lt, ge, le, eq, not_ = G.Terminals('> < >= <= == !')
    while_, do = G.Terminals('while do')

    program %= class_list, lambda s: ProgramNode(s[1])

    class_list %= class_def, lambda s: [s[1]]
    class_list %= class_def + class_list, lambda s: [s[1]] + s[2]

    class_def %= class_keyword + idx + obrack + feature_list + cbrack, lambda s: ClassDef(
        s[2], s[4])
    class_def %= class_keyword + idx + dd + typex + obrack + feature_list + cbrack, lambda s: ClassDef(
        s[2], s[6], s[4])

    feature_list %= meod_def, lambda s: [s[1]]
    feature_list %= attr_def, lambda s: [s[1]]
    feature_list %= meod_def + feature_list, lambda s: [s[1]] + s[2]
    feature_list %= attr_def + feature_list, lambda s: [s[1]] + s[2]

    meod_def %= def_keyword + idx + opar + param_list_empty + cpar + dd + typex + obrack + statement_list + cbrack, lambda s: MethodDef(
        s[2], s[4], s[7], s[9])

    attr_def %= idx + dd + typex + dot_comma, lambda s: AttributeDef(
        s[1], s[3])
    attr_def %= idx + dd + typex + equal + exp + dot_comma, lambda s: AttributeDef(
        s[1], s[3], s[5])

    param_list_empty %= param_list, lambda s: s[1]
    param_list_empty %= G.Epsilon, lambda s: []
    param_list %= param, lambda s: [s[1]]
    param_list %= param + coma + param_list, lambda s: [s[1]] + s[3]

    param %= idx + dd + typex, lambda s: Param(s[1], s[3])

    statement_list %= exp + dot_comma, lambda s: [s[1]]
    statement_list %= exp + dot_comma + statement_list, lambda s: [s[1]] + s[3]

    # var_dec %= let + idx + dd + typex + equal + exp, lambda s: VariableDeclaration(s[2],s[4],s[6])
    var_dec %= let + idx + dd + typex + assign + exp + in_keyword + obrack + statement_list + cbrack, lambda s: VariableDeclaration(
        s[2], s[4], s[6], s[9])

    exp %= arith, lambda s: s[1]
    arith %= arith + plus + term, lambda s: PlusNode(s[1], s[3])
    arith %= arith + minus + term, lambda s: DifNode(s[1], s[3])
    arith %= term, lambda s: s[1]
    term %= term + star + factor, lambda s: MulNode(s[1], s[3])
    term %= term + div + factor, lambda s: DivNode(s[1], s[3])
    term %= factor, lambda s: s[1]
    factor %= opar + arith + cpar, lambda s: s[2]
    factor %= num, lambda s: IntegerConstant(s[1])
    factor %= idx, lambda s: VariableCall(s[1])
    factor %= idx + period + idx + opar + args_list_empty + cpar, lambda s: FunCall(
        s[1], s[3], s[5])
    factor %= idx + opar + args_list_empty + cpar, lambda s: FunCall(
        'self', s[1], s[3])
    exp %= var_dec, lambda s: s[1]
    exp %= true, lambda s: TrueConstant()
    exp %= false, lambda s: FalseConstant()
    exp %= string_const, lambda s: StringConstant(s[1])
    exp %= if_ + opar + exp + cpar + then + obrack + exp + cbrack + else_ + obrack + exp + cbrack, lambda s: IfThenElseNode(
        s[3], s[7], s[11])
    exp %= idx + assign + exp, lambda s: AssignNode(s[1], s[3])
    exp %= atom, lambda s: s[1]
    exp %= new + idx + opar + args_list_empty + cpar, lambda s: InstantiateClassNode(
        s[2], s[4])
    atom %= factor + gt + factor, lambda s: GreaterThanNode(s[1], s[3])
    atom %= factor + lt + factor, lambda s: LowerThanNode(s[1], s[3])
    atom %= factor + eq + factor, lambda s: EqualToNode(s[1], s[3])
    atom %= factor + ge + factor, lambda s: GreaterEqualNode(s[1], s[3])
    atom %= factor + le + factor, lambda s: LowerEqual(s[1], s[3])
    atom %= not_ + factor, lambda s: NotNode(s[2])

    exp %= while_ + opar + exp + cpar + do + obrack + statement_list + cbrack, lambda s: WhileBlockNode(
        s[3], s[7])

    typex %= intx, lambda s: 'int'
    typex %= boolean, lambda s: 'bool'
    typex %= string, lambda s: 'string'
    typex %= objectx, lambda s: 'object'
    typex %= idx, lambda s: s[1]
    typex %= auto, lambda s: 'AUTO_TYPE'
    typex %= void, lambda s: 'void'

    args_list_empty %= args_list, lambda s: s[1]
    args_list_empty %= G.Epsilon, lambda s: []
    args_list %= exp, lambda s: [s[1]]
    args_list %= exp + coma + args_list, lambda s: [s[1]] + s[3]

    table = [
        (class_keyword, 'class'), (def_keyword, 'def'), (in_keyword, 'in'),
        (intx, 'int'), (boolean, 'bool'), (objectx, 'object'),
        (string, 'string'), (true, 'true'), (false, 'false'),
        (auto, 'AUTO_TYPE'), (if_, 'if'), (then, 'then'), (else_, 'else'),
        (new, 'new'),
        (while_, 'while'), (do, 'do'), (coma, ','), (period, '.'), (dd, ':'),
        (dot_comma, ';'), (assign, '<@-'), (lt, '@<'), (gt, '@>'), (ge, '>='),
        (le, '<='), (eq, '=='), (not_, '@!'), (equal, '='), (opar, '@('),
        (cpar, '@)'), (obrack, '@{'), (cbrack, '@}'), (plus, '@+'),
        (minus, '@-'), (div, '/'), (star, '@*'), (let, 'let'),
        (idx,
         '(A|a|B|b|C|c|D|d|E|e|F|f|G|g|H|h|I|i|J|j|K|k|L|l|M|m|N|n|O|o|P|p|Q|q|R|r|S|s|T|t|u|U|V|v|W|w|X|x|Y|y|Z|z)+'
         ), (num, '0|(1|2|3|4|5|6|7|8|9)(1|2|3|4|5|6|7|8|9|0)*'),
        (string_const,
         "@'(A|a|B|b|C|c|D|d|E|e|F|f|G|g|H|h|I|i|J|j|K|k|L|l|M|m|N|n|O|o|P|p|Q|q|R|r|S|s|T|t|u|U|V|v|W|w|X|x|Y|y|Z|z)+@'"
         )
    ]

    lexer = Lexer(table, G.EOF, ignore_white_space=True)
    return G, lexer
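A brief, hedged usage sketch; it only inspects attributes already used elsewhere in these examples (Productions, nonTerminals, terminals).

G, lexer = build_cool_grammar()
print(len(G.Productions), 'productions')
print(len(G.nonTerminals), 'non-terminals,', len(G.terminals), 'terminals')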
Example #20
class Parser(object):
	def __init__(self, filename):
		'''
		Initialise the grammar to be used to parse each line in the log file
		Setup multiprocessing shared object and other generic structs
		'''
		#Function dictionary, something to mimic function pointers
		self.parse_line = {
			'octeon':self.parse_line_octeon,
			'others':self.parse_line_other
		}

		self.log = Logger("parser","info") #???? Not the best implementation yet
		#Initialise variables
		self.__filepath = filename
		#Setup the grammar to use based on the filetype
		self.__grammar_type = self.get_grammar_type(filename)
		self.__current_grammar = Grammar(self.__grammar_type).grammar

		self.parse_file()

	def parse_file(self):
		'''
		Read a file and distribute chunks to parsing routine.
		Collect chunks and send for generating the final parsed
		list.
		'''
		self.__manager = mp.Manager()
		self.__out_list = self.__manager.list()
		self.__dict_list,self.__lines, self.__pcap_text = [],[],[]
		self.__last_ppm_timestamp = [[(-1, 0)] * MAX_CORES for _ in range(MAX_OCTS)]  #Independent per-core list for each octeon (avoid row aliasing)
		with open(self.__filepath,'r') as f:
			self.__lines = f.readlines()
			self.__num_lines = len(self.__lines)
			self.__chunk_size = self.__num_lines // mp.cpu_count()
			self.__chunk_size = self.__num_lines \
				if self.__chunk_size < self.__num_lines \
				else self.__chunk_size
			#Distribute work to processes in chunks
			processes = [mp.Process(target=self.parse_line[self.__grammar_type], args=(index,))\
				for index in range(0,len(self.__lines),self.__chunk_size)]
			[process.start() for process in processes]
			[process.join() for process in processes]
		#Cleanup parsed object list
		self.generate_parsed_list()

	def parse_line_octeon(self, index):
		'''
		Parse a chunk of lines from original file to create
		a list of defaultdict objects, which is appended to 
		a multiprocessing Queue as a tuple (starting index,list(defaultdict))
		'''
		temp_list = []
		for offset,line in enumerate(self.__lines[index:]):
			lineno = index + offset
			self.log.logger.debug("[File:%s]Parsing:[line#%d]%s" \
				% (self.__filepath,lineno,line.strip()))
			try:
				result = self.__current_grammar.parseString(line)
			except ParseException:
				self.log.logger.error("[File:%s]Unrecognized format:[line#%d]%s" \
					% (self.__filepath,lineno,line.strip()))
				return
			
			if not result.core:
				self.log.logger.error("[File:%s]Expected [core#] field:[line#%d]%s" \
					% (self.__filepath,lineno,line.strip()))
				return

			if not (result.linux or result.vxworks):
				self.log.logger.error("[File:%s]Not a linux or vxworks format:[line#%d]%s" \
					% (self.__filepath,lineno,line.strip()))
				return

			dic = defaultdict(int)
			dic["systime"] = datetime.strptime(" ".join(result.systime),"%b %d %H:%M:%S.%f")			
			if result.timestamp or result.ppm:
				if result.linux:
					dic['oct'], dic['core'], dic['ppm'], dic['timestamp'] = \
						result.octeon, result.core, result.ppm, result.timestamp
				else:
					dic['oct'] = 0
					dic['core'], dic['ppm'], dic['timestamp'] = \
						result.core, result.ppm, result.timestamp
			else:
				if result.linux:
					dic['oct'], dic['core'] = result.octeon,result.core
				else:
					dic['oct'] = 0
					dic['core'] = result.core
				dic['ppm'] = -1
				dic['timestamp'] = -1
			dic['p_pcap'] = "".join(result.pcap)
			dic['p_line'] = lineno
			temp_list.append(dict(dic))
		self.__out_list.append((index,temp_list))

	def parse_line_other(self, index):
		'''
		Parse a chunk of lines from original file to create
		a list of defaultdict objects, which is appended to 
		a multiprocessing Queue as a tuple (starting index,list(defaultdict))
		'''
		temp_list = []
		for offset,line in enumerate(self.__lines[index:]):
			lineno = index + offset
			self.log.logger.debug("[File:%s]Parsing:[line#%d]%s" \
				% (self.__filepath,lineno,line.strip()))
			try:
				result = self.__current_grammar.parseString(line)
			except ParseException:
				self.log.logger.error("[File:%s]Unrecognized format:[line#%d]%s" \
					% (self.__filepath,lineno,line.strip()))
				return
			
			dic = defaultdict(int)
			dic['systime'] = datetime.strptime(" ".join(result.systime),"%b %d %H:%M:%S.%f")			
			dic['p_line'] = lineno
			temp_list.append(dict(dic))
		self.__out_list.append((index,temp_list))

	def generate_parsed_list(self):
		'''
		Sort based on index from return tuple list and flatten the list of dicts
		Update the timestamp and ppm field of the dicts by using last known ts and ppm
		for given oct/core values.
		'''
		self.__dict_list = sorted(self.__out_list,key=lambda x: x[0])
		self.__dict_list = [item for tup in self.__dict_list for item in tup[1]]
		for item in iter(self.__dict_list):
			item['systime'] = item['systime'].replace(year=datetime.now().year)
			if self.__grammar_type == "octeon":
				if item['timestamp'] > 0 or item['ppm'] > 0:
					self.__last_ppm_timestamp[item['oct']][item['core']] = (item['ppm'],item['timestamp'])
				else:
					item['ppm'],item['timestamp'] = self.__last_ppm_timestamp[item['oct']][item['core']]
				self.__pcap_text.append(item['p_pcap'])

	def get_grammar_type(self,filename):
		if set("log.octData").issubset(set(self.__filepath)):
			return "octeon"
		else:
			return "others"

	def get_pcap_text(self):
		return self.__pcap_text

	def get_filename(self):
		'''Return the file from which the object was created'''
		return self.__filepath

	def get_lines(self):
		'''Return a list of lines from the original file'''
		return self.__lines

	def get_len(self):
		'''Return length of list of lines from the original file'''
		return self.__num_lines

	def get_dict_list(self):
		'''
		Return a list of defaultdict objects created on parsing
		lines from the original file
		''' 
		return self.__dict_list

	def get_sortable_attributes(self):
		'''Return a list of current keys in the dict list'''
		attributes = [key for key in self.__dict_list[0].keys() if not str(key).startswith('p_')]
		return attributes

	def print_sorted_list(self,attr):
		for item in sorted(self.__dict_list, key=operator.itemgetter(attr)):
			print(self.__lines[item["p_line"]].strip())
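A hedged usage sketch for the class above: the constructor parses the whole file during initialisation, so instantiating it is the entire workflow; the file name is illustrative.

parser = Parser('device.log.octData')
print(parser.get_len(), 'lines parsed')
for row in parser.get_dict_list()[:5]:
    print(row)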
    "VP += clitic:Pron[Pronoun_Form=weak!]",

    # "VP[head=verb] -> verb:Vb",
    "VP += subj:NP[nr=@ pers=@ cas=Nom pos=0? has_det=T]",
    "VP += dobj:NP[cas=Acc pos=1?]",
    "VP += iobj:NP[cas==Dat!]",
    "VP += pcomp*:PP",
    "VP += adv*:Adv",

    # "NP[nr~pl] -> n1:NP[cas=@ pos=0!], cc:C[pos=1!] , n2:NP[cas=@ pos=2!]",
    # "VP -> v1:VP[pos=0!], cc:C[pos=1!] , v2:VP[pos=2!]",
    "SP[head=vp] -> part:Part[Type=Subj! pos=0!] , vp:VP[mod=Subj]",
    "VP += dobj:SP"
]

grammar = Grammar([rule_from_string(s) for s in rules_str])

infilename = 'easy.txt.conllu'  #'C:\\Users\\ffxvtj\\OneDrive\\Lingv\\Corpus\\ro_rrt-ud-train.conllu'
print("Loading file " + infilename)
conll: Conll = pyconll.load_from_file(infilename)
print("Done loading file")


class ProcessConll:
    def __init__(self, conll, grammar):
        self.conll = conll
        self.i = 0
        self.parses = []
        self.grammar = grammar

    def next(self):
Example #22
 def test_grammar_get_encoding_length(self):
     lexicon = Lexicon(['abb', 'bba'], self.feature_table)
     grammar = Grammar(self.feature_table, self.constraint_set, lexicon)
     self.assertEqual(grammar.get_encoding_length(), 154)