def map_DDT_tags_to_CST_terminals(self, mapping_filename="DDT-to-CST-mapping.csv"):
		"""Build a Grammar from the DDT-to-CST tag mapping CSV file.

		Every non-empty CSV row is turned into
		``GrammarRule(row[1], [row[0]], 0)`` and handed to
		``Grammar.count_rule``.
		NOTE(review): presumably column 0 is the DDT tag and column 1 the CST
		terminal it maps to -- confirm against the CSV file and GrammarRule.

		Args:
			mapping_filename: path of the CSV file to read. Defaults to the
				file name that used to be hard-coded here.

		Returns:
			The Grammar holding one counted rule per non-empty row.

		Raises:
			IOError: if the mapping file cannot be opened.
			IndexError: if a non-empty row has fewer than two columns.
		"""
		grammar = Grammar()
		with open(mapping_filename, "r") as mapping_file:
			for row in csv.reader(mapping_file):
				# csv.reader yields an empty list for a blank line; skip it
				# instead of failing on row[1].
				if not row:
					continue
				grammar.count_rule(GrammarRule(row[1], [row[0]], 0))
		return grammar
	def parse_treebank(self, treebank_filename):
		"""Extract a Grammar from the treebank file `treebank_filename`.

		The file is parsed with BeautifulSoup. For every <s> element, the <t>
		and <nt> elements inside it become Terminal and Nonterminal objects
		(built from their id/cat/lemma/word attributes) stored by their `.id`.
		Two kinds of rules are then counted:

		* per edge of a nonterminal:
		  GrammarRule(edge label, [owner category, target category], 0)
		* per nonterminal with more than one edge:
		  GrammarRule(nonterminal category, [edge labels except "--"], 0)

		Progress and timing messages are printed to stdout.

		Returns:
			The Grammar with all counted rules.
		"""
		# Load the complete file and let BeautifulSoup parse it
		print(">>TREEBANK: Reading and xml-parsing the DDT file.")
		with open(treebank_filename, "r") as treebank_file:
			soup = BeautifulSoup(treebank_file.read())
		print(">>LOG: Time spent is %s seconds" % self.log.time_since_last_check())

		grammar = Grammar()

		print(">>TREEBANK: Extracting grammar from sentences.")
		for sentence in soup.findAll('s'):
			# Terminals of this sentence, indexed by id
			terminals = {}
			for t_xml in sentence.findAll('t'):
				leaf = Terminal(t_xml.get('id'), t_xml.get('cat'), t_xml.get('lemma'), t_xml.get('word'))
				terminals[leaf.id] = leaf

			# Nonterminals of this sentence with their [idref, label] edges
			nonterminals = {}
			for nt_xml in sentence.findAll('nt'):
				node = Nonterminal(nt_xml.get('id'), nt_xml.get('cat'), nt_xml.get('lemma'), nt_xml.get('word'))
				node.add_edges([[edge_xml.get('idref'), edge_xml.get('label')] for edge_xml in nt_xml.findAll('edge')])
				nonterminals[node.id] = node

			# One rule per edge: label --> [owner category, target category]
			for node in nonterminals.values():
				owner_cat = node.category
				for edge in node.edges:
					target_id = edge[0]
					# The target is looked up among the nonterminals first,
					# and among the terminals otherwise.
					target = nonterminals[target_id] if target_id in nonterminals else terminals[target_id]
					grammar.count_rule(GrammarRule(edge[1], [owner_cat, target.category], 0))

			# One rule per branching nonterminal: category --> edge labels
			for node in nonterminals.values():
				if len(node.edges) <= 1:
					continue
				# Edges labelled "--" are left out of the constituent list.
				labels = [edge[1] for edge in node.edges if edge[1] != "--"]
				grammar.count_rule(GrammarRule(node.category, labels, 0))

		print(">>TREEBANK: Done. Total runtime %s seconds" % (time.time() - self.log.starttime))

		return grammar
	def new_treebank_parser(self, treebank_filename):
		"""Extract phrase-style rules from the treebank file `treebank_filename`.

		The file is parsed with BeautifulSoup. For every <s> element, the <t>
		and <nt> elements inside it become Terminal and Nonterminal objects
		(built from their id/cat/lemma/word attributes) stored by their `.id`.

		For each nonterminal with more than one edge, one rule is counted:
		GrammarRule(category + "P", constituents, 0), where each edge
		contributes one constituent:

		* target is a nonterminal with more than one edge: its category + "P"
		* target is any other nonterminal: its category
		* target is not a nonterminal: the category of the terminal

		The target kind is decided by looking the idref up among the
		nonterminals rather than by the edge label, so an edge whose label
		does not follow the '"--" means terminal' convention no longer raises
		KeyError. For data that follows the convention the result is the same.

		Progress and timing messages are printed to stdout.

		Returns:
			The Grammar with all counted rules.

		Raises:
			KeyError: if an edge refers to an id that is neither a terminal
				nor a nonterminal of the same sentence.
		"""
		# Reading and xml-parsing the DDT file
		print(">>TREEBANK: Reading and xml-parsing the DDT file.")
		with open(treebank_filename, "r") as treebank_file:
			treebank = BeautifulSoup(treebank_file.read())
		print(">>LOG: Time spent is %s seconds" % self.log.time_since_last_check())

		grammar = Grammar()

		# Extracting grammar from sentences
		print(">>TREEBANK: Extracting grammar from sentences.")
		for sentence in treebank.findAll('s'):
			# extract terminals
			terminals = {}
			for t_xml in sentence.findAll('t'):
				terminal = Terminal(t_xml.get('id'), t_xml.get('cat'), t_xml.get('lemma'), t_xml.get('word'))
				terminals[terminal.id] = terminal

			# extract nonterminals together with their [idref, label] edges
			nonterminals = {}
			for nt_xml in sentence.findAll('nt'):
				nonterminal = Nonterminal(nt_xml.get('id'), nt_xml.get('cat'), nt_xml.get('lemma'), nt_xml.get('word'))
				nonterminal.add_edges([[edge_xml.get('idref'), edge_xml.get('label')] for edge_xml in nt_xml.findAll('edge')])
				nonterminals[nonterminal.id] = nonterminal

			for node in nonterminals.values():
				# Only nonterminals with more than one edge produce a rule.
				if len(node.edges) <= 1:
					continue
				phrase_cat = node.category + "P"
				constituents = []
				for edge in node.edges:
					target_id = edge[0]
					if target_id in nonterminals:
						child = nonterminals[target_id]
						# A child with more than one edge is itself a phrase.
						if len(child.edges) > 1:
							constituents.append(child.category + "P")
						else:
							constituents.append(child.category)
					else:
						constituents.append(terminals[target_id].category)
				grammar.count_rule(GrammarRule(phrase_cat, constituents, 0))

		return grammar
    def run(self):
        """Extract a grammar from "ddt-1.0.xml" and pickle it to "temp_grammar".

        The treebank is parsed with BeautifulSoup's "xml" parser. For every
        <s> element, the <t> and <nt> elements inside it become Terminal and
        Nonterminal objects (built from their id/cat/lemma/word attributes)
        stored by their `.id`. Two kinds of rules are counted:

        * per edge of a nonterminal:
          GrammarRule(edge label, [owner category, target category])
        * per nonterminal with more than one edge:
          GrammarRule(nonterminal category, [edge labels except "--"])

        The resulting Grammar is written with pickle.dump to the file
        "temp_grammar" in the current working directory. Progress and timing
        messages are printed to stdout. Nothing is returned.
        """
        print(">>TREEBANK: Reading and xml-parsing the DDT file.")
        with open("ddt-1.0.xml", "r") as treebank_file:
            treebank = BeautifulSoup(treebank_file.read(), "xml")
        print(">>LOG: Time spent is %s seconds" % self.log.time_since_last_check())

        grammar = Grammar()

        print(">>TREEBANK: Extracting grammar from sentences.")
        for sentence in treebank.findAll("s"):
            # extract terminals
            terminals = {}
            for t_xml in sentence.findAll("t"):
                terminal = Terminal(t_xml.get("id"), t_xml.get("cat"), t_xml.get("lemma"), t_xml.get("word"))
                terminals[terminal.id] = terminal

            # extract nonterminals together with their [idref, label] edges
            nonterminals = {}
            for nt_xml in sentence.findAll("nt"):
                nonterminal = Nonterminal(nt_xml.get("id"), nt_xml.get("cat"), nt_xml.get("lemma"), nt_xml.get("word"))
                nonterminal.add_edges([[edge_xml.get("idref"), edge_xml.get("label")] for edge_xml in nt_xml.findAll("edge")])
                nonterminals[nonterminal.id] = nonterminal

            # Count edge --> nonterminals
            for node in nonterminals.values():
                edge_startpoint_cat = node.category
                for edge in node.edges:
                    # The edge target is either another nonterminal or a terminal.
                    if edge[0] in nonterminals:
                        edge_endpoint_cat = nonterminals[edge[0]].category
                    else:
                        edge_endpoint_cat = terminals[edge[0]].category
                    grammar.count_rule(GrammarRule(edge[1], [edge_startpoint_cat, edge_endpoint_cat]))

            # Count nonterminal --> edges
            for node in nonterminals.values():
                if len(node.edges) > 1:
                    # Edges labelled "--" are left out of the constituent list.
                    constituents = [edge[1] for edge in node.edges if edge[1] != "--"]
                    grammar.count_rule(GrammarRule(node.category, constituents))

        print(">>LOG: Time spent is %s seconds" % self.log.time_since_last_check())

        print(">>TREEBANK: Pickling the temporary grammar.")
        # Binary mode keeps the pickle byte stream untouched on every platform
        # (text mode may translate newlines).
        with open("temp_grammar", "wb") as grammar_file:
            pickle.dump(grammar, grammar_file)
        print(">>LOG: Time spent is %s seconds" % self.log.time_since_last_check())

        print(">>TREEBANK: Total runtime %s seconds" % (time.time() - self.log.starttime))