def __init__(self, clusterNum, dataDir, lemmaDir, outputDir, printStats=False):
		#print "* Parsing cluster " + str(clusterNum)
		self.clusterNum = clusterNum
		self.references = defaultdict(list) # ref_id -> [Mention1, Mention2, ...], spanning all docs in the cluster
		self.docs = defaultdict(Document) # doc_id -> Document

		# NOTE: this map exists only so we can print stats and see how many
		#       Mentions are 'singletons' (not encompassed by any Ref);
		#       it is declared here but never filled in this constructor
		self.mentions = defaultdict(list) # m_id -> [ref_id1, ref_id2, ...], spanning all docs in the cluster
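		# illustrative sketch (IDs invented): after parsing, self.references might hold
		#   { "ACT16236402809085484": [<Mention 'crashed' of 3_1ecbplus.xml>,
		#                              <Mention 'crash' of 3_2ecbplus.xml>], ... }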
		self.headDoc = str(clusterNum) + "_1ecbplus.xml"
		# data directories
		self.dataDir = dataDir + str(clusterNum) + '/'
		self.lemmaDir = lemmaDir + str(clusterNum) + '/ecbplus/'
		self.outputDir = outputDir
		self.numPairs = 0
		self.numMentions = 0
		# holds "doc_id;m_id" strings for the gold-truth output (debugging only)
		self.mentionsList = []

		# iterates over each ECB+ ('*plus.xml') file in the cluster's data dir
		for f in glob(self.dataDir + '*plus.xml'):

			#print "file: " + str(f)
			doc_id = f[f.rfind("/") + 1:]
			doc = Document(doc_id)

			tokenIDs = defaultdict(str)

			# gets the contents of the file
			with open (f, "r") as myfile:
				fileContents=myfile.read().replace('\n', ' ')

			# reads the <token> elements
			it = tuple(re.finditer(r"<token t_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</token>", fileContents))
			for match in it:
				t_id = int(match.group(1))
				token = match.group(3)
				tokenIDs[t_id] = token
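			# e.g. (values invented for illustration), a token element looks like:
			#   <token t_id="5" sentence="1" number="4">crashed</token>
			# after which tokenIDs[5] == "crashed"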

			# reads <Markables>
			regex = r"<([\w]+) m_id=\"(\d+)\".*?>(.*?)</.*?>"
			markables = fileContents[fileContents.find("<Markables>") + len("<Markables>"):fileContents.find("</Markables>")]
			it = tuple(re.finditer(regex, markables))
			for match in it:
				isPred = False
				if "ACTION" in match.group(1):
					isPred = True
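				# in the ECB+ scheme, event markables carry ACTION-prefixed tags
				# (e.g., ACTION_OCCURRENCE), while entities use tags like HUMAN_PART_*,
				# LOC_*, TIME_*; note NEG_ACTION_* tags also contain "ACTION" and are
				# therefore treated as predicates by the substring test above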
				m_id = int(match.group(2))

				# gets the token IDs
				regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
				it2 = tuple(re.finditer(regex2, match.group(3)))
				curTokenIDs = []
				text = ""
				for match2 in it2:
					tokenID = int(match2.group(1))
					curTokenIDs.append(tokenID)
					text = text + str(tokenIDs[tokenID]) + " "
				text = text.rstrip()
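				# e.g. (token IDs invented for illustration), a two-token markable holds:
				#   <token_anchor t_id="5"/><token_anchor t_id="6"/>
				# giving curTokenIDs = [5, 6] and text = tokenIDs[5] + " " + tokenIDs[6]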

				# constructs the Mention
				mention = Mention(m_id, text, curTokenIDs, isPred, doc_id)

				# adds to the Doc and stores it (we will update the Doc w/ ref info below)
				doc.addMention(mention)

			# reads <Relations>
			relations = fileContents[fileContents.find("<Relations>") + len("<Relations>"):fileContents.find("</Relations>")]
			regex = r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)</.*?>"
			it = tuple(re.finditer(regex, relations))
			for match in it:
				ref_id = match.group(1)
				regex2 = r"<source m_id=\"(\d+)\".*?/>"
				it2 = tuple(re.finditer(regex2, match.group(2)))

				for match2 in it2:
					m_id = int(match2.group(1))

					doc.mentions[m_id].addReference(ref_id)
					if doc.mentions[m_id] in self.references[ref_id]:
						print "** we already have the mention added to the ref!"
						exit(1)
					else:
						self.references[ref_id].append(doc.mentions[m_id])

				# adds the current ref_id to the Doc
				doc.addRef(ref_id)

			self.docs[doc_id] = doc # stores the Doc object locally

			# now let's read the lemmas, provided by Sergio's StanfordNLP-parsed files
			with open(self.lemmaDir + doc_id, 'r') as f_lemma:
				fileContents = f_lemma.read().replace('\n', ' ')
			lemmaContent = fileContents[fileContents.find("<lemmas>") + len("<lemmas>"):fileContents.find("</lemmas>")]
			regex = r"<span m_id=\"(.+?)/(.+?)\".+?pos=\"(.+?)\">(.+?)</span>"
			it = tuple(re.finditer(regex, lemmaContent))
			for match in it:
				filename = match.group(1)
				cur_m_id = match.group(2)
				pos = match.group(3)
				lemma = match.group(4)

				posTags = pos.split()

				# adds the lemma and POS tags to the Mention (self.docs stores references,
				# so updating curDoc mutates the stored Document in place)
				curDoc = self.docs[filename]
				curDoc.mentions[int(cur_m_id)].addLemma(lemma)
				curDoc.mentions[int(cur_m_id)].addPOSTags(posTags)


		# counts how many ordered mention-pair combinations exist per cluster
		for doc1 in self.docs.values():
			for m1 in doc1.mentionsList:
				for doc2 in self.docs.values():
					for m2 in doc2.mentionsList:
						if m1 != m2:
							self.numPairs += 1
		# prints the gold-truth file (1 per cluster; they later need to be cat'd together)
		fout = open(self.outputDir + str(self.clusterNum) + ".txt", 'w')
		for r in self.references.keys():
			for m in self.references[r]:
				self.numMentions += 1
				self.mentionsList.append(str(m.doc_id) + ";" + str(m.m_id)) # TODO: only used for debugging
				fout.write(str(self.clusterNum) + ";" + str(r) + ";" + str(m.doc_id) + ";" + str(m.m_id) + ";" + str(m.text.lower()) + ";" + str(m.lemma.lower()) + "\n")
		fout.close()
		print str(self.clusterNum) + " -> " + str(self.numMentions) + " mentions"
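		# each gold-truth line is "cluster;ref_id;doc_id;m_id;text;lemma", e.g.
		# (values invented for illustration): 3;ACT16236402809085484;3_1ecbplus.xml;28;crashed;crash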


		if printStats:
			# writes per-cluster stats to the clusterMentions/ sub-dir
			fout = open(self.outputDir + 'clusterMentions/' + str(self.clusterNum) + ".txt", 'w')

			# collects stats: how many Mentions does each Reference contain?
			# e.g., 1 mention per Ref happens 13 times,
			#       2 mentions per Ref happens 4 times,
			#       3 mentions per Ref happens 5 times
			self.numMentionCounts = defaultdict(int)
			for r in self.references.keys():
				count = len(self.references[r])
				self.numMentionCounts[count] += 1
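			# e.g., if 13 Refs in this cluster each contain exactly 1 mention,
			# then numMentionCounts[1] == 13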

			sorted_x = sorted(self.numMentionCounts.items(), key=operator.itemgetter(0))
			fout.write("# Mentions per Ref, # times this occurred\n")
			fout.write("------------------------------\n")
			for i in sorted_x:
				fout.write(str(i[0]) + "," + str(i[1]) + "\n")
			fout.write("\n")

			fout.write("-------------------------- REFS NOT CONTAINED IN HEAD DOC --------------------------------\n")
			for r in self.references.keys():
				#if r not in self.docs[self.headDoc].refs:
				fout.write("\t " + r + " (" + str(len(self.references[r])) + " mentions):\n")
				for m in self.references[r]:
					fout.write("\t\t" + m.text + " (doc " + str(m.doc_id) + "; m_id: " + str(m.m_id) + "; lemma: " + m.lemma + ")\n")

			fout.close()