Example #1
def convert2_nltk_CFG(G):
    terminals, NTs, P, S = G
    Prod = copy(P)
    # this is here to ensure full coverage of terminals
    # when parsing the grammar for testing
    Prod["DUMMY"] = [list(map(lambda x: (x, ), terminals))]
    assert len(S) > 0  # need a start symbol
    if len(S) > 1:
        if "NT0" not in Prod.keys():
            Prod["NT0"] = []
        for Si in S:
            Prod["NT0"].append([(Si, )])
    assert "NT0" in S
    start = nltk.Nonterminal("NT0")
    nltk_nts = nltk.nonterminals(" ".join(list(NTs)))
    productions = []
    # only look at nonterminals with productions
    for NT in Prod.keys():
        for rule in Prod[NT]:
            rhs = rule_to_tuple(rule, NTs)
            #print("convert", NT, rhs)
            prod = nltk.Production(nltk.Nonterminal(NT), rhs)
            productions.append(prod)
    # production is empty here...
    return nltk.grammar.CFG(start, productions)
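A minimal usage sketch for convert2_nltk_CFG (assumptions: "from copy import copy" and the rule_to_tuple helper shown in Example #14 are in scope, and each rule is a list of 1-tuples of symbols, which is the format rule_to_tuple expects):

import nltk
from copy import copy

terminals = {"a", "b"}
NTs = {"NT0", "NT1"}
P = {
    "NT0": [[("NT1",), ("a",)]],   # NT0 -> NT1 'a'
    "NT1": [[("b",)]],             # NT1 -> 'b'
}
S = {"NT0"}
cfg = convert2_nltk_CFG((terminals, NTs, P, S))
print(cfg.start(), len(cfg.productions()))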
Example #2
    def __init__(self, pcfg_filename):
        
        self.probability_of_production = {}  # map Production -> probability
        start = None
        with open(pcfg_filename, "r") as f:
            lines = f.readlines()
        productions = []
        for line in lines:
            matches = re.match(r"(\S+)\s*->\s*(\S+)(\s+\S+)?\s+\[([0-9.]+)\]?", line)
            groups = matches.groups()
            group_count = len(groups)
            assert group_count == 4
            lhs = nltk.Nonterminal(groups[0].strip())
            if groups[2] is None:
                production = nltk.Production(lhs, [ groups[1].strip('\'') ])
            else:
                production = nltk.Production(lhs, [ nltk.Nonterminal(groups[1].strip()), nltk.Nonterminal(groups[2].strip()) ])
            probability = float(groups[3].strip())
            
            # Read the Production rule:
            if (start is None):
                start = lhs

            productions.append(production)
            self.probability_of_production[production] = probability
        self.grammar = nltk.grammar.CFG(start, productions, False)
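A hedged note on the expected input: the regex above matches one rule per line, written roughly as an LHS, one or two RHS symbols, and a probability in square brackets (the same surface form str() gives for an nltk grammar). A hypothetical pcfg_filename could contain:

S -> NP VP [0.8]
S -> 'stop' [0.2]
NP -> 'the' [1.0]
VP -> 'runs' [1.0]

Two-symbol right-hand sides are turned into pairs of nltk.Nonterminal objects; single-symbol right-hand sides are treated as quoted terminals.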
Example #3
    def __count_productions_recursively(self,
                                        node: nltk.Tree) -> nltk.Production:
        """Recursively parses a tree representation of a sentence."""
        label = node.label()
        # Traverse the tree:
        if (len(node) == 2):
            # Handle non-leaf nodes:
            left = self.__count_productions_recursively(node[0])
            right = self.__count_productions_recursively(node[1])
            production = nltk.Production(
                nltk.Nonterminal(label),
                [left.lhs(), right.lhs()])
        else:
            # Handle leaf node.
            token = node[0]
            self.token_count += 1
            if (token not in self.count_per_token):
                self.count_per_token[token] = 1
            else:
                self.count_per_token[token] += 1
            production = nltk.Production(nltk.Nonterminal(label), [token])

        # Update our count of this particular production.
        if (production not in self.count_per_production):
            self.count_per_production[production] = 1
        else:
            self.count_per_production[production] += 1
        # Update our count of all productions with a particular LHS.
        lhs = production.lhs()
        if (lhs not in self.lhs_count):
            self.lhs_count[lhs] = 1
        else:
            self.lhs_count[lhs] += 1
        return production
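As a cross-check on the counting above, nltk.Tree.productions() extracts the same lexical and non-lexical rules directly from a parsed tree; a minimal sketch, independent of the class this method belongs to:

import nltk
from collections import Counter

tree = nltk.Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
count_per_production = Counter(tree.productions())         # Production -> count
lhs_count = Counter(p.lhs() for p in tree.productions())   # Nonterminal -> count
print(count_per_production)
print(lhs_count)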
Example #4
    def __init__(self, pcfg_filename):
        
        self.log_probability_of_production = {}  # map Production -> log probability
        self.min_log_probability = None
        start = None
        # Read the file:
        with open(pcfg_filename, "r") as f:
            lines = f.readlines()
        # Parse the file's contents:
        productions = []
        for line in lines:
            matches = re.match(r"(\S+)\s*->\s*(\S+)(\s+\S+)?\s+\[([0-9.]+)\]?", line)
            groups = matches.groups()
            group_count = len(groups)
            assert group_count == 4
            lhs = nltk.Nonterminal(groups[0].strip())
            if groups[2] is None:
                production = nltk.Production(lhs, [ groups[1].strip('\'') ])
            else:
                production = nltk.Production(lhs, [ nltk.Nonterminal(groups[1].strip()), nltk.Nonterminal(groups[2].strip()) ])
            log_probability = math.log(float(groups[3].strip()))
            
            # Read the Production rule:
            if (start is None):
                start = lhs

            productions.append(production)
            self.log_probability_of_production[production] = log_probability
            if self.min_log_probability is None or math.fabs(log_probability) > math.fabs(self.min_log_probability):
                self.min_log_probability = log_probability
        self.grammar = nltk.grammar.CFG(start, productions, False)
        # Make it much less probable than the actual minimum log probability but still non-zero
        # (log probabilities are negative, so doubling moves further below zero).
        self.min_log_probability = self.min_log_probability * 2
Example #5
    def induce_structure(self, sentences):

        sentences = [[c for c in s] for s in sentences]

        start_symbols = set()
        productions = []
        prod_table = {}

        # group all digits together
        digit_terminals = set([str(i) for i in range(10)])

        # unary rules
        terminals = set()
        for s in sentences:
            terminals.update(s)
        for t in terminals:
            if t in digit_terminals:
                nt = nltk.Nonterminal("Digit")
            else:
                nt = nltk.Nonterminal("Unary%s" % self.gen_nt())
            p = Production(nt, [t])
            productions.append(p)
            prod_table[tuple(p.rhs())] = p.lhs()

        sentences = self.apply_unary_prod(sentences, prod_table)

        while len(sentences) > 0:
            if self.has_recursion(sentences):
                p = self.generate_recursive_prod(sentences)
            else:
                p = self.generate_most_frequent_prod(sentences)

            productions.append(p)
            prod_table[tuple(p.rhs())] = p.lhs()

            sentences = self.update_with_prod(sentences, prod_table)

            new_sentences = []
            for s in sentences:
                if len(s) == 1:
                    start_symbols.add(s[0])
                else:
                    new_sentences.append(s)

            sentences = new_sentences

        # generate the start productions
        for symbol in start_symbols:
            for p in list(productions):  # iterate over a snapshot; we append to productions below
                if p.lhs() == symbol:
                    productions.append(Production(self.start, p.rhs()))

        self.grammar = nltk.induce_pcfg(self.start, productions)
Example #6
def pcfg_reverse(word):
    s = build_tree(word, 0)
    tree = nltk.Tree.fromstring(s)
    productions = tree.productions()
    for p in productions:
        
        ##################################################
        # !!! THIS IS WHERE THE MAGIC HAPPENS !!!        #
        if len(p._rhs) > 1:                              #
            p._rhs = (p._rhs[1], p._rhs[0])              #
            ##############################################
            
    grammar = nltk.induce_pcfg(nltk.Nonterminal("N0"), productions)
#     print(grammar)     # UNCOMMENT FOR A FUN TIME!
    parser = nltk.pchart.InsideChartParser(grammar)
    
    # Shuffle to generate 1000 possible words; only the correct
    # solution will be parseable with our grammar!
    for i in range(1000):
        cand = random.sample(word, len(word))
#         print(cand)               # UNCOMMENT FOR A FUN TIME!
        for parse in parser.parse(cand):
            if parse._ProbabilisticMixIn__prob > 0:
#                 print("number of tries: {}".format(i))  # UNCOMMENT!
                return "".join(cand)
    return "no reverse found, try again"
Example #7
def expand_all(grammar, nonterm, state):
     result = ""
     queue = Queue.LifoQueue()
     queue.put_nowait(nonterm)
          # do this iteratively; recursion blows past Python's recursion limit
     in_list = None
     len_at_start_of_list = 0
     while not queue.empty():
          head = queue.get_nowait()
          # Keep track of being in a list until all the bits for the list
          # have been used up
          if head in state.list_bits:
               in_list = head
               len_at_start_of_list = queue.qsize()
          # done with the list once we consume the next item in the queue
          if in_list and queue.qsize() < len_at_start_of_list:
               in_list = None
          terms = expand(grammar, head, state, in_list)
          if len(terms) == 0:
               if isinstance(head, basestring):
                    result = " ".join([result, head])
               else:
                    result = " ".join([result, str(head)])
          else :
               # put them into the lifo queue backwards, so we'll get the
               # first one out
               for nt in reversed(terms):
                    if nt in state.common.append_newlines():
                         queue.put_nowait(nltk.Nonterminal("\n"))
                    queue.put_nowait(nt)
     return result
Example #8
    def fit_pcfg(self, X):

        if self.fitted_pcfg:
            raise ValueError("PCFG.pcfg already fitted")

        productions = []
        for sentence in X:
            # nltk format
            t = nltk.tree.Tree.fromstring(sentence,
                                          remove_empty_top_bracketing=True)
            # Chomsky normal form
            self.chomkysation(t)
            # rule extraction
            rules = self.extract_rules(t, lexical=False)
            productions.extend(rules)

        start = nltk.Nonterminal('SENT')
        self.pcfg_ = nltk.induce_pcfg(start, productions)
        self.pcfg_.chomsky_normal_form(flexible=False)

        #get tokens
        for prod in self.pcfg_._productions:
            for token in prod._rhs:
                if not token == 'SENT':
                    self.non_terminals.append(token)
        self.non_terminals.insert(0, start)

        #get tokens2index
        self.pos2index = {}
        for i, token in enumerate(self.non_terminals):
            self.pos2index[token] = i

        self.fitted_pcfg = True
Example #9
    def convert_hybrid_productions(self, production):
        """
        Convert a hybrid production into valid CNF by creating new non-terminals for any terminals in the production
        :param production: a hybrid production
        :return:
        """
        terminal_list = []
        new_rhs = []

        for node in production.rhs():
            if nltk.grammar.is_nonterminal(node):
                new_rhs.append(node)
            else:
                terminal_list.append(node)
                # replace each terminal with a new non-terminal
                new_rhs.append(nltk.Nonterminal(node.upper()))

        if self.is_long(new_rhs):
            self.convert_long_productions(production.lhs().symbol(), new_rhs)
        else:
            self.add_production(production.lhs().symbol(),
                                self.production_string(new_rhs))

        # add new productions for each new non-terminal created
        for string in terminal_list:
            self.add_production(string.upper(),
                                self.production_string([string]))
    def build(self, rhs):
        newKey = 'X' + str(self.seed)

        self.seed = self.seed + 1

        lhs = nltk.Nonterminal(newKey)

        return self.buildNormal(lhs, rhs)
Example #11
 def _train_rules_grammar(self):
     print("training grammar")
     self._grammar = nltk.induce_pcfg(
         nltk.Nonterminal('TOP'),
         reduce(lambda a, b: a + b,
                map(lambda t: t.productions(), self._treebank)))
     if RUN_MODE == PURE_CKY_M and UNKOWN_MODE:
         add_unknowns(self._grammar)
     print("finished grammar training")
Example #12
def create_rule_with_RHS(rhs):
    global counter
    newKey = 'X' + str(counter)
    counter += 1
    lhs = nltk.Nonterminal(newKey)
    if isinstance(rhs, str):
        rhs = [rhs]

    return create_rule(lhs, rhs)
Example #13
 def reset_to_previous_finished_query(self):
     end = None
     for i, production in reversed(list(enumerate(self.actions))):
         if production.lhs() == nltk.Nonterminal('_Q_'):
             end = i
             break
     productions = self.actions[:end]
     self.reset()
     for production in productions:
         self.apply_production(production)
Example #14
def rule_to_tuple(rule, nonterminals):
    rhs = []
    for token in rule:
        assert type(token) == tuple
        assert len(token) == 1
        symbol = token[0]
        if symbol in nonterminals:
            rhs.append(nltk.Nonterminal(symbol))
        else:
            rhs.append(symbol)
    return tuple(rhs)
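A minimal usage sketch: symbols listed in nonterminals come back wrapped as nltk.Nonterminal, everything else stays a terminal string.

rule = [("VP",), ("dog",)]
print(rule_to_tuple(rule, {"NP", "VP"}))  # (VP, 'dog') -- i.e. (Nonterminal('VP'), 'dog')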
Example #15
def choose(nonterm, in_list, prods, state):
     if nonterm in state.last_or_nots:
          # if True, use the last one, otherwise, use anything but.
          # this choice uses no bits
          if state.last_or_nots[nonterm]:
               return prods[len(prods)-1]
          else:
               return random.choice(prods[:-1])
     elif state.bitstring.index >= len(state.bitstring.bitstring):
          # We're past the end of the message, so just pick randomly
          return random.choice(prods)
     elif len(prods) < 3:
          return prods[0]

     bits = int(math.log(len(prods)-1, 2))
     prevPow2 = math.pow(2, bits)

     # For lists, only pick the end of the list once we've used all
     # the bits, or we're out of bits.  Unless we're the last list
     # left, then keep going until we consume all bits
     end_list = False
     if in_list and in_list in state.list_bits:
          bits_left = state.list_bits[in_list]
          if (bits_left <= 0 and len(state.list_bits) > 1 and
              nonterm in state.common.list_recursive_terms()):
               end_list = True
               del state.list_bits[in_list]
          else:
               state.list_bits[in_list] = bits_left - bits
               #print ("Consuming %s bits for list %s (%s left)" %
               #       (bits, in_list, state.list_bits[in_list]))

     # otherwise, use the first 'bits' bits to pick the index
     index = int(prevPow2)
     if not end_list:
          strindex = state.bitstring.bitstring[state.bitstring.index:bits+
                                               state.bitstring.index]
          index = int(strindex, 2)
          #print ("(%s) Using bits %s (%s -> %s)" %
          #       (len(state.bitstring.bitstring)-state.bitstring.index, strindex, nonterm, prods[index]))
          state.bitstring.index += bits
          state.bitstring.index = min(state.bitstring.index,
                                      len(state.bitstring.bitstring))
     # now interpret as an int
     prod = prods[index]
     if len(state.list_bits) == 0 and nonterm == nltk.Nonterminal("CFP_BODY"):
          # set the list bits
          state.list_bits.update(
               state.common.calc_list_bits(len(state.input_text)*8, prod))
     return prod
Example #16
 def createGrammar(self, userMessages, ctx):
     parser = CoreNLPParser(url='http://localhost:9000')
     parse_trees = []
     for message in userMessages:
         tokenized = nltk.sent_tokenize(message)
         for sentence in tokenized:
             parse_trees.append(list(parser.raw_parse(sentence))[0])
     grammar_rules = set()
     for tree in parse_trees:
         for production in tree.productions():
             grammar_rules.add(production)
     start = nltk.Nonterminal('S')
     grammar = nltk.induce_pcfg(start, grammar_rules)
     return (' '.join((self.generate_sentence(grammar))))
Example #17
def pcfg_train(trees, vocab):
    #    Write a function pcfg_train() that takes as its input a collection
    #    of nltk.tree.Tree objects. For example, it might be passed some
    #    portion of nltk.corpus.treebank.parsed_sents(). This function
    #    should return a nltk.PCFG object.

    all_productions = []

    for t in trees:
        for p in t.productions():
            all_productions.append(nltk.Production(p.lhs(), p.rhs()))

    pcfg = nltk.induce_pcfg(nltk.Nonterminal('S'), all_productions)

    return (pcfg)
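A minimal usage sketch against the treebank sample bundled with nltk (assumes the treebank corpus has been downloaded via nltk.download; vocab is unused by pcfg_train, so None is passed):

import nltk

trees = nltk.corpus.treebank.parsed_sents()[:50]
pcfg = pcfg_train(trees, vocab=None)
print(pcfg.start(), len(pcfg.productions()))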
Example #18
    def parse(self, sent):

        lhs = [pair[0] for pair in self.grammar]
        rhs = [pair[1] for pair in self.grammar]

        stack = []
        index = 0
        shift = True

        while index != len(sent):

            reduce = False
            
            if shift:
                stack.append(sent[index])
                index += 1
            
            tags = [x.label() if not isinstance(x, tuple)\
                    else funcs.ConvertPosTag(x[1]) for x in stack]
            
            for i in range(len(stack)-2, -1, -1):

                if tags[i:] in rhs:

                    shift = False
                    reduce = True
                    parent = lhs[rhs.index(tags[i:])]
                    children = stack[i:]
                    tree = nltk.tree.Tree(parent, children)
                    stack = stack[:i]
                    stack.append(tree)
                    break
            
            if not reduce: shift = True
            

        s = nltk.Nonterminal("S")
        tree = nltk.tree.Tree(s, stack)

        return tree
        
Example #19
def train_pcfg():
    print 'training grammar'
    productions = []
    # print len(treebank.fileids())
    trees = []
    # use only a few of the (up to 199) files for a smaller grammar and quicker training
    for fileid in treebank.fileids()[0:20]:
        for tree in treebank.parsed_sents(fileid):
            # perform optional tree transformations, e.g.:
            # Remove branches A->B->C into A->B+C so we can avoid infinite
            # productions
            tree.collapse_unary(collapsePOS=False)
            # Remove A->(B,C,D) into A->B,C+D->D (binarization req'd by CKY parser)
            # horizontal and vertical Markovization: remember parents and siblings in tree
            #     This gives a performance boost, but makes the grammar HUGE
            #     If we use these we would need to implement a tag forgetting method
            #tree.chomsky_normal_form(horzMarkov = 0, vertMarkov=0)
            tree.chomsky_normal_form()
            productions += tree.productions()
    S = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(S, productions)
    print "grammar trained!"
    return grammar
Example #20
    def buildGrammar(self):
        print('Starting building Grammar...')
        start = time.time()
        # We need to modify https://www.cs.bgu.ac.il/~elhadad/nlp16/NLTK-PCFG.html, cells 20/21
        # So that we only keep terminals which are PoS tags
        productions = []

        # This will be used in the buildLexicon function
        self.tokens = set() # Tokens in the vocabulary
        self.posTags = set() # PoS tags linked to these tokens
        for tree in self.trees:
            for production in tree.productions():
                if not production.is_lexical(): # We only keep rules whose terminals are PoS tags --> we don't keep terminals which are tokens
                    productions.append(production)
                else: # We keep rules which have a token as terminal
                    self.tokens.add(production.rhs()[0])
                    self.posTags.add(production.lhs())
        root = nltk.Nonterminal('SENT')
        grammar = nltk.induce_pcfg(root, productions)

        end = time.time()
        print('... Grammar built. Time: {:2.2}s'.format(end-start))
        return grammar
Example #21
from collections import defaultdict

import nltk
from nltk import ProbabilisticProduction

UNKNOWN_T = nltk.Nonterminal('UNKOWN')


def add_unknowns(grammar):
    terminal_derivation_non_t = set(
        [prod.lhs() for prod in grammar.productions() if prod.is_lexical()])
    terminal_derivation_probs = defaultdict(int)
    for term_prod in terminal_derivation_non_t:
        terminal_derivation_probs[term_prod] = sum(
            prod.prob() for prod in grammar.productions(lhs=term_prod)
            if prod.is_lexical())
        if terminal_derivation_probs[term_prod] > 1:

            terminal_derivation_probs[term_prod] = 1
    total_non_t_rules = defaultdict(lambda: {
        'l': defaultdict(lambda: [0, 0.0]),
        'r': defaultdict(lambda: [0, 0.0])
    })
    for prod in grammar.productions():
        rule = prod.lhs()
        if len(prod.rhs()) > 1:
            l_non_t, r_non_t = prod.rhs()
            prob_l_derives_t = terminal_derivation_probs[l_non_t]
            prob_r_derives_t = terminal_derivation_probs[r_non_t]
            total_non_t_rules[rule]['l'][l_non_t][0] += 1
            total_non_t_rules[rule]['l'][l_non_t][1] += prod.prob()
Example #22
                self._grammar._productions.append(
                    ProbabilisticProduction(Nonterminal(pos), [tok],
                                            prob=0.000001))
        if missing:
            self._grammar._calculate_indexes()

        print(self._grammar)
        return super(PCFGViterbiParser, self).parse(tokens)


f = "/home/esyir/Documents/A-star/NLP/data/GENIA_treebank_v1/10022882.xml"

tree = xmltree.analyze(f)
production = tree[0].productions()

S = nltk.Nonterminal('S')

grammar = nltk.induce_pcfg(S, production)

viterbi_parser = PCFGViterbiParser.train(production, 'S')
tokenized = st.preprocess(
    "all of these results suggested that the slps of both l. acidophilus strains possessed murein hydrolase activities that were sublethal to e. coli cells."
)

a123 = "all of these results suggested that the slps of both l. acidophilus strains possessed murein hydrolase activities that were sublethal to e. coli cells."

tokenized2 = tokenized[0]

print(tokenized2)

print("BAM")
Example #23
def generate_with_word(start_word):
    start_symbol = nltk.Nonterminal("wypowiedzenie:|" + start_word)

    result = generate_symbols(start_symbol)

    print(" ".join(result))
Example #24
import nltk
from nltk.corpus import BracketParseCorpusReader
from nltk import induce_pcfg

treebank = BracketParseCorpusReader(
    "resources/",
    "skladnica_with_heads.txt",
)

productions = []
for item in treebank.fileids()[:2]:
    for tree in treebank.parsed_sents(item):
        #tree.draw()
        productions += tree.productions()

grammar = induce_pcfg(nltk.Nonterminal('wypowiedzenie:|'), productions)
print(grammar.start())
#print(grammar.productions())
#print(grammar._lhs_index)
#print(grammar.productions(lhs=grammar.start()))

#print(grammar.productions(lhs=nltk.Nonterminal("wypowiedzenie:|mogę")))
#print(grammar.productions(lhs=nltk.Nonterminal("znakkonca:|.")))

used_symbols = []


def generate_symbols(symbol):

    if nltk.grammar.is_terminal(symbol):
Example #25
def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--seed', metavar='S', type=int,
                         help='the random number generator seed')
     parser.add_argument('--socket', metavar='SOCKET', type=str,
                         help='the local socket to bind to')
     parser.add_argument('--infile', metavar='FILE', type=str,
                         help='read from this file instead of stdin')
     parser.add_argument('--outfile', metavar='FILE', type=str,
                         help='write to this file instead of stdout')
     parser.add_argument('--website', metavar='W', type=str,
                         help='a website link to include, if any '
                         '(must start with "http://")')
     args = parser.parse_args()

     if args.socket:
          ok = scipherd.call_daemon(args.socket, True,
                                    args.infile, args.outfile)
          if ok:
               sys.exit(0)
          else:
               sys.exit(-1)

     if args.seed:
          seed = args.seed
     else:
          seed = random.randint(0, 2**32)
     random.seed(seed)
     sys.stderr.write("Random seed: %d\n" % seed)

     input_text = ""
     for line in sys.stdin:
          input_text += line.decode('utf-8')

     if len(input_text) > 2**20:
          print "Input text must be smaller than 1MB."
          sys.exit(-1)

     common = cfp_common.CfpCommon.get_latest_common()
     space_before = re.compile('\s([%s])' %
                               common.chars_to_remove_a_space_before())
     space_after = re.compile('([%s])\s' %
                              common.chars_to_remove_a_space_after())

     last_or_nots = common.choose_last_or_nots()
     if args.website:
          if args.website.find("http://") != 0:
               sys.stderr.write("Bad website: %s\n" % args.website)
               sys.exit(-1)
          last_or_nots[nltk.Nonterminal("SUBMIT_CLOSING")] = True

     # load grammars
     #print "1) %s" % time.time()
     header_grammar = nltk.data.load("file:%s" % common.header_cfg_filename(),
                                     'cfg')
     body_grammar = nltk.data.load("file:%s" % common.body_cfg_filename(),
                                   'cfg')
     #print "2) %s" % time.time()
     state = EncodeState(input_text, Bitstring(), common, header_grammar,
                         body_grammar, {}, space_before, space_after,
                         last_or_nots, LastTime())
     (header, body) = do_encode(state, args.website)
     print header
     print ""
     print body
Example #26
def main():
    options = parse_args()

    # parameter set 1
    #assert(options.corpus_name!=None);
    assert (options.input_directory != None)
    assert (options.output_directory != None)

    input_directory = options.input_directory
    input_directory = input_directory.rstrip("/")
    corpus_name = os.path.basename(input_directory)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    assert (options.grammar_file != None)
    grammar_file = options.grammar_file
    assert (os.path.exists(grammar_file))

    # Documents
    train_docs = []
    input_stream = open(os.path.join(input_directory, 'train.dat'), 'r')
    for line in input_stream:
        train_docs.append(line.strip())
    input_stream.close()
    print("successfully load all training documents...")

    # parameter set 2
    if options.number_of_documents > 0:
        number_of_documents = options.number_of_documents
    else:
        number_of_documents = len(train_docs)
    if options.batch_size > 0:
        batch_size = options.batch_size
    else:
        batch_size = number_of_documents
    #assert(number_of_documents % batch_size==0);
    training_iterations = number_of_documents // batch_size
    if options.training_iterations > 0:
        training_iterations = options.training_iterations
    #training_iterations=int(math.ceil(1.0*number_of_documents/batch_size));
    #multiprocesses = options.multiprocesses;
    assert (options.number_of_processes >= 0)
    number_of_processes = options.number_of_processes

    # parameter set 3
    assert (options.grammaton_prune_interval > 0)
    grammaton_prune_interval = options.grammaton_prune_interval
    snapshot_interval = grammaton_prune_interval
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval
    assert (options.tau >= 0)
    tau = options.tau
    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert (options.kappa >= 0 and options.kappa <= 1)
    kappa = options.kappa
    if batch_size <= 0:
        print("warning: running in batch mode...")
        kappa = 0

    # read in adaptor grammars
    desired_truncation_level = {}
    alpha_pi = {}
    beta_pi = {}

    grammar_rules = []
    adapted_non_terminals = set()
    #for line in codecs.open(grammar_file, 'r', encoding='utf-8'):
    for line in open(grammar_file, 'r'):
        line = line.strip()
        if line.startswith("%"):
            continue
        if line.startswith("@"):
            tokens = line.split()
            assert (len(tokens) == 5)
            adapted_non_terminal = nltk.Nonterminal(tokens[1])
            adapted_non_terminals.add(adapted_non_terminal)
            desired_truncation_level[adapted_non_terminal] = int(tokens[2])
            alpha_pi[adapted_non_terminal] = float(tokens[3])
            beta_pi[adapted_non_terminal] = float(tokens[4])
            continue
        grammar_rules.append(line)
    grammar_rules = "\n".join(grammar_rules)

    # Warning: if you are using nltk 2.x, please use parse_grammar()
    #from nltk.grammar import parse_grammar, standard_nonterm_parser
    #start, productions = parse_grammar(grammar_rules, standard_nonterm_parser, probabilistic=False)
    from nltk.grammar import read_grammar, standard_nonterm_parser
    start, productions = read_grammar(grammar_rules,
                                      standard_nonterm_parser,
                                      probabilistic=False)
    print("start, productions: ", start, productions)
    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S") + ""
    #desired_truncation_level_string = "".join(["%s%d" % (symbol, desired_truncation_level[symbol]) for symbol in desired_truncation_level]);
    #alpha_pi_string = "".join(["%s%d" % (symbol, alpha_pi[symbol]) for symbol in alpha_pi]);
    #beta_pi_string = "".join(["%s%d" % (symbol, beta_pi[symbol]) for symbol in beta_pi]);
    #output_directory += "-" + str(now.microsecond) + "/";
    suffix += "-D%d-P%d-S%d-B%d-O%d-t%d-k%g-G%s/" % (
        number_of_documents,
        #number_of_topics,
        grammaton_prune_interval,
        snapshot_interval,
        batch_size,
        training_iterations,
        tau,
        kappa,
        #alpha_theta,
        #alpha_pi_string,
        #beta_pi_string,
        #desired_truncation_level_string,
        os.path.basename(grammar_file))

    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a input_stream
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("grammar_file=" + str(grammar_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_processes=" +
                              str(number_of_processes) + "\n")
    #options_output_file.write("multiprocesses=" + str(multiprocesses) + "\n");
    options_output_file.write("number_of_documents=" +
                              str(number_of_documents) + "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("training_iterations=" +
                              str(training_iterations) + "\n")

    # parameter set 3
    options_output_file.write("grammaton_prune_interval=" +
                              str(grammaton_prune_interval) + "\n")
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) +
                              "\n")
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")

    # parameter set 4
    #options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n");
    options_output_file.write("alpha_pi=%s\n" % alpha_pi)
    options_output_file.write("beta_pi=%s\n" % beta_pi)
    options_output_file.write("desired_truncation_level=%s\n" %
                              desired_truncation_level)
    # parameter set 5
    #options_output_file.write("heldout_data=" + str(heldout_data) + "\n");
    options_output_file.close()

    print("========== ========== ========== ========== ==========")
    # parameter set 1
    print("output_directory=" + output_directory)
    print("input_directory=" + input_directory)
    print("corpus_name=" + corpus_name)
    print("grammar_file=" + str(grammar_file))

    # parameter set 2
    print("number_of_documents=" + str(number_of_documents))
    print("batch_size=" + str(batch_size))
    print("training_iterations=" + str(training_iterations))
    print("number_of_processes=" + str(number_of_processes))
    #print("multiprocesses=" + str(multiprocesses)

    # parameter set 3
    print("grammaton_prune_interval=" + str(grammaton_prune_interval))
    print("snapshot_interval=" + str(snapshot_interval))
    print("tau=" + str(tau))
    print("kappa=" + str(kappa))

    # parameter set 4
    #print("alpha_theta=" + str(alpha_theta)
    print("alpha_pi=%s" % alpha_pi)
    print("beta_pi=%s" % beta_pi)
    print("desired_truncation_level=%s" % desired_truncation_level)
    # parameter set 5
    #print("heldout_data=" + str(heldout_data)
    print("========== ========== ========== ========== ==========")

    import hybrid
    print("passing prodcutions = : ", productions)
    adagram_inferencer = hybrid.Hybrid(start, productions,
                                       adapted_non_terminals)

    adagram_inferencer._initialize(number_of_documents, batch_size, tau, kappa,
                                   alpha_pi, beta_pi, None,
                                   desired_truncation_level,
                                   grammaton_prune_interval)
    '''
    clock_iteration = time.time();
    clock_e_step, clock_m_step = adagram_inferencer.seed(train_docs);
    clock_iteration = time.time()-clock_iteration;
    print('E-step, M-step and Seed take %g, %g and %g seconds respectively...' % (clock_e_step, clock_m_step, clock_iteration))
    '''

    #adagram_inferencer.export_adaptor_grammar(os.path.join(output_directory, "infag-0"))
    #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-0"))

    random.shuffle(train_docs)
    training_clock = time.time()
    snapshot_clock = time.time()
    for iteration in range(int(training_iterations)):
        start_index = batch_size * iteration
        end_index = batch_size * (iteration + 1)
        if start_index // number_of_documents < end_index // number_of_documents:
            #train_doc_set = train_docs[(batch_size * iteration) % (number_of_documents) :] + train_docs[: (batch_size * (iteration+1)) % (number_of_documents)];
            train_doc_set = train_docs[(batch_size * iteration) %
                                       (number_of_documents):]
            random.shuffle(train_docs)
            train_doc_set += train_docs[:(batch_size * (iteration + 1)) %
                                        (number_of_documents)]
        else:
            train_doc_set = train_docs[(batch_size * iteration) %
                                       (number_of_documents):
                                       (batch_size *
                                        (iteration + 1)) % number_of_documents]

        clock_iteration = time.time()
        #print("processing document:", train_doc_set
        clock_e_step, clock_m_step = adagram_inferencer.learning(
            train_doc_set, number_of_processes)

        if (iteration + 1) % snapshot_interval == 0:
            #pickle_file = open(os.path.join(output_directory, "model-%d" % (adagram_inferencer._counter+1)), 'wb');
            #pickle.dump(adagram_inferencer, pickle_file);
            #pickle_file.close();
            adagram_inferencer.export_adaptor_grammar(
                os.path.join(output_directory, "infag-" + str(
                    (iteration + 1))))
            #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str((iteration+1))))

        if (iteration + 1) % 1000 == 0:
            snapshot_clock = time.time() - snapshot_clock
            print('Processing 1000 mini-batches take %g seconds...' %
                  (snapshot_clock))
            snapshot_clock = time.time()

        clock_iteration = time.time() - clock_iteration
        print(
            'E-step, M-step and iteration %d take %g, %g and %g seconds respectively...'
            % (adagram_inferencer._counter, clock_e_step, clock_m_step,
               clock_iteration))

    adagram_inferencer.export_adaptor_grammar(
        os.path.join(output_directory,
                     "infag-" + str(adagram_inferencer._counter + 1)))
    #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str((iteration+1))))

    pickle_file = open(
        os.path.join(output_directory, "model-%d" % (iteration + 1)), 'wb')
    pickle.dump(adagram_inferencer, pickle_file)
    pickle_file.close()

    training_clock = time.time() - training_clock
    print('Training finished in %g seconds...' % (training_clock))
treeData = getTreeData(data)
print("Done!")
print(" ")

########################
## Q u e s t i o n 6a ##
########################

#list to store all the extracted rules
treeProductions = []

# productions() function is used to extract the grammar rule for each sentence
for tr in treeData:  # for tree in treeData
    treeProductions += tr.productions()

S = nltk.Nonterminal("S")
grammar = nltk.induce_pcfg(S, treeProductions)

### Extracting PCFG to a text file
#grammar_PCFG = str(grammar)
#file = open('/Users/mayapetranova/Documents/QMUL/NLP/assignment_2/6/PCFG.txt', 'w')
#file.write(grammar_PCFG)
#file.close()

########################
## Q u e s t i o n 6b ##
########################

sentence = "show me the meals on the flight from Phoenix".split()
parser = pchart.InsideChartParser(grammar)
for tp in parser.parse(sentence):
    print(tp)
Example #28
def get_number(tree, grammar, state, in_list_arg = None):
    if type(tree) != nltk.Tree:
         return ((tree,), [])
    nt_label = nltk.Nonterminal(tree.label())
    if state.done.done:
         return ((nt_label,), [])

    prods = grammar.productions(nt_label)
    rhs = ()
    num = []

    # come up with a format with the right number of leading zeroes
    bits = 0
    if len(prods) >= 3:
         bits = int(math.log(len(prods)-1, 2))
    formatstr = "{0:0%db}" % bits
    prevPow2 = math.pow(2, bits)

    in_list = in_list_arg
    if not in_list and nt_label in state.list_bits:
         in_list = nt_label

    use_bits = True
    if nt_label in state.common.choose_last_or_nots():
         use_bits = False

    # Consume list bits before we start recursing, since that's the order
    # ./encode.py does it.
    is_list = False
    if bits > 0 and in_list in state.list_bits:
         is_list = True
         if use_bits:
              # if we know that this is going to be the end of a list such
              # that the power of 2 was chosen, then don't bother subtracting
              # the bits from the main done.
              if (nt_label in state.common.list_recursive_terms() and
                  state.list_bits[in_list] <= 0 and len(state.list_bits) > 1):
                   use_bits = False

              state.list_bits[in_list] -= bits
              #print ("Consumed %d bits for list %s, %s left" %
              #       (bits, in_list, list_bits[in_list]))

    prev_bits_left = state.done.bits_left
    if use_bits:
         #print "(%s) Consumed %s bits for %s" % (prev_bits_left, bits, nt_label)
         state.done.bits_left -= bits

    # set up list_bits if needed, before recursing:
    subtrees = tree.subtrees().next()
    if len(state.list_bits) == 0 and nt_label == nltk.Nonterminal("CFP_BODY"):
         body_rhs = tuple(nltk.Nonterminal(t.label()) for t in subtrees)
         for p in prods:
              if p.rhs() == body_rhs:
                   state.list_bits.update(
                        state.common.calc_list_bits(state.done.total_len, p))
                   break

    child_bits = 0
    for t in subtrees:
        (t_rhs, t_num) = get_number(t, grammar, state, in_list)
        rhs += t_rhs
        num.extend(t_num)

    end_list = False
    if bits == 0:
         # If this had fewer than 3 rules, the first one was always
         # used, so don't produce any bits
         return ((nt_label,), num)
    elif is_list:
         if (in_list not in state.list_bits or
             (state.list_bits[in_list] <= 0 and len(state.list_bits) > 1)):
              end_list = True

    for i in range(len(prods)):
        if prods[i].rhs() == rhs:
            if (len(state.list_bits) == 0 and
                nt_label == nltk.Nonterminal("CFP_BODY")):
                 state.list_bits.update(state.common.calc_list_bits(
                      state.done.total_len, prods[i]))

            if prev_bits_left <= 0:
                state.done.done = True
                return ((nt_label,), [])
            elif is_list and i == prevPow2:
                 if use_bits:
                      state.done.bits_left += bits  # encode didn't count these
                 if in_list in state.list_bits:
                      bits_left = state.list_bits[in_list]
                      if bits_left <= 0 and len(state.list_bits) > 1:
                           del state.list_bits[in_list]
                 # end of the list -- still count the choices below us
                 return ((nt_label,), num)
            else:
                 istring = formatstr.format(i)
                 #print ("(%s) Using %s bits %s (%s -> %s)" %
                 #       (prev_bits_left, bits, istring, nt_label, rhs))
                 return ((nt_label,), [istring]+num)
    print "Couldn't find rhs for label %s, rhs %s" % (rhs, tree.label())
# @author: SHILPASHREE RAO
import sys
import nltk
from itertools import islice
from nltk import Tree
import collections

#with open(sys.argv[1], 'r') as f:
with open('train.trees.pre.unk', 'r') as f:
    #    data = f.read()
    data = f.readlines()

productions = []
t = []
TOP = nltk.Nonterminal('TOP')
for i in data:
    t = Tree.fromstring(i)
    productions += t.productions()
grammar = nltk.induce_pcfg(TOP, productions)
lefRule = []
rightRule = []
for i in productions:
    lefRule.append(i.lhs())
    rightRule.append(i.rhs())
tupList = zip(lefRule, rightRule)
count = collections.Counter(tupList)
maxv = max([i for i in count.values()])
out = (count.keys()[count.values().index(maxv)], maxv)
print "Frequently occuring rule and it's frequency", out
num = len(grammar.productions())
import nltk
import ancora
path = 'ancora/ancora-3.0.1es/'
corpus = ancora.AncoraCorpusReader(path)

t = corpus.parsed_sents()[0]
t.draw()
t.productions()

prods = []
for t in corpus.parsed_sents():
    prods += t.productions()

#print (prods)

S = nltk.Nonterminal('sentence')
grammar = nltk.induce_pcfg(S, prods)

#prods2 = grammar.productions(lhs=nltk.Nonterminal('ncms000'))
#print (prods2)

print("===============================================================")
print("===============================================================")

parser = nltk.ViterbiParser(grammar)
for tree in parser.parse("El gato come pescado crudo .".split()):
    print(tree)
    tree.draw()
    tree.prob()