	def __init__(self, clusterNum, dataDir, lemmaDir, outputDir, printStats=False):
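		"""
		Parses one cluster of ECB+-style *plus.xml documents: builds a Document (with
		its Mentions) per file, links Mentions to their cross-document references,
		attaches lemmas/POS tags from the parsed files under lemmaDir, and writes a
		gold-truth file to outputDir/<clusterNum>.txt.
		(Parameter roles are inferred from how they are used below.)
		"""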
		#print "* Parsing cluster " + str(clusterNum)
		self.clusterNum = clusterNum
		self.references = defaultdict(list) # ref_id -> [m_id1, m_id2, ... ] which spans all docs in the cluster
		self.docs = defaultdict(Document) # doc_id -> Document

		# NOTE: the point of this variable is just print stats and see how many
		#       Mentions are 'singletons' (not encompassed by any Ref)
		self.mentions = defaultdict(list) # m_id -> [ref_id1, ref_id2, ... ] # which spans all docs in the cluster
		self.headDoc = str(clusterNum) + "_1ecbplus.xml"
		makeGoldTruth = False
		# data directories
		self.dataDir = dataDir + str(clusterNum) + '/'
		self.lemmaDir = lemmaDir + str(clusterNum) + '/ecbplus/'
		self.outputDir = outputDir  # ('clusterMentions/' is appended where needed, e.g. in the printStats block below)
		self.numPairs = 0
		self.numMentions = 0
		# list of "doc_id;m_id" strings collected while writing the gold-truth file (only used for debugging)
		self.mentionsList = []

		# iterates through each file in the given dir
		for f in glob(self.dataDir + '*plus.xml'):

			#print "file: " + str(f)
			doc_id = f[f.rfind("/") + 1:]
			doc = Document(doc_id)

			tokenIDs = defaultdict(str)

			# gets the contents of the file
			with open(f, "r") as myfile:
				fileContents = myfile.read().replace('\n', ' ')

			# reads <tokens>
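			# the regex below assumes each token element looks roughly like
			#   <token t_id="5" sentence="1" ...>quake</token>
			# (t_id and sentence first, in that order; any further attributes are ignored)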
			it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
			for match in it:
				t_id = int(match.group(1))
				token = match.group(3)
				tokenIDs[t_id] = token

			# reads <Markables>
			regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
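			# e.g. a markable such as (tag name illustrative; anything containing "ACTION" is treated as a predicate):
			#   <ACTION_OCCURRENCE m_id="25" ...><token_anchor t_id="5"/>...</ACTION_OCCURRENCE>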
			markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
			it = tuple(re.finditer(regex, markables))
			for match in it:
				isPred = False
				if "ACTION" in match.group(1):
					isPred = True
				m_id = int(match.group(2))

				# gets the token IDs
				regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
				it2 = tuple(re.finditer(regex2, match.group(3)))
				curTokenIDs = []
				text = ""
				for match2 in it2:
					tokenID = int(match2.group(1))
					curTokenIDs.append(tokenID)
					text = text + str(tokenIDs[tokenID]) + " "
				text = text.rstrip()

				# constructs the Mention
				mention = Mention(m_id, text, curTokenIDs, isPred, doc_id)

				# adds to the Doc and stores it (we will update the Doc w/ ref info below)
				doc.addMention(mention)

			# reads <Relations>
			relations = fileContents[fileContents.find("<Relations>"):fileContents.find("</Relations>")]
			regex = r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)?</.*?>"
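			# the "note" attribute serves as the cross-document reference id; each
			# <source m_id="..."/> inside the relation points to a mention in this doc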
			it = tuple(re.finditer(regex, relations))
			for match in it:
				ref_id = match.group(1)
				regex2 = r"<source m_id=\"(\d+)\".*?/>"
				it2 = tuple(re.finditer(regex2, match.group(2)))

				for match2 in it2:
					m_id = int(match2.group(1))

					doc.mentions[m_id].addReference(ref_id)
					if doc.mentions[m_id] in self.references[ref_id]:
						print "** we already have the mention added to the ref!"
						sys.exit(1)
					else:
						self.references[ref_id].append(doc.mentions[m_id])

				# adds the current ref_id to the Doc
				doc.addRef(ref_id)

			self.docs[doc_id] = doc # stores the Doc object locally

			# now read the lemmas, provided by Sergio's StanfordNLP-parsed files
			with open(self.lemmaDir + doc_id, 'r') as f_lemma:
				fileContents = f_lemma.read().replace('\n', ' ')
			lemmaContent = fileContents[fileContents.find("<lemmas>")+8:fileContents.find("</lemmas>")]
			regex = r"<span m_id=\"(.+?)/(.+?)\".+?pos=\"(.+?)\">(.+?)</span>"
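			# each span is keyed by "<doc filename>/<m_id>", e.g. something like:
			#   <span m_id="36_1ecbplus.xml/25" pos="VBD">announce</span>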
			it = tuple(re.finditer(regex, lemmaContent))
			for match in it:
				filename = match.group(1)
				cur_m_id = match.group(2)
				pos = match.group(3)
				lemma = match.group(4)

				posTags = pos.split()

				# adds the lemma/POS tags to the Mention (the Document holds a reference
				# to the same Mention object, so this updates it in place)
				curDoc = self.docs[filename]
				curDoc.mentions[int(cur_m_id)].addLemma(lemma)
				curDoc.mentions[int(cur_m_id)].addPOSTags(posTags)


		# counts how many possible mention-pair combinations we have per cluster
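		# (every ordered pair (m1, m2) with m1 != m2 is counted, so each unordered pair contributes 2)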
		for d1 in self.docs.keys():
			doc1 = self.docs[d1]
			#print "doc" + str(d1) + " has " + str(len(doc1.mentionsList))
			for m1 in doc1.mentionsList:

				for d2 in self.docs.keys():
					doc2 = self.docs[d2]
					for m2 in doc2.mentionsList:
						if m1 != m2:
							self.numPairs = self.numPairs + 1
		#print "numpairs; " + str(self.numPairs)
		# PRINTS THE GOLD TRUTH FILE (1 per cluster; I later need to cat them together)
		fout = open(self.outputDir + str(self.clusterNum) + ".txt", 'w')
		for r in self.references.keys():
			#if r not in self.docs[self.headDoc].refs:
			for m in self.references[r]:
				self.numMentions = self.numMentions + 1
				self.mentionsList.append(str(m.doc_id) + ";" + str(m.m_id)) # TODO: only used for debugging
				fout.write(str(self.clusterNum) + ";" + str(r) + ";" + str(m.doc_id) + ";" + str(m.m_id) + ";" + str(m.text.lower()) + ";" + str(m.lemma.lower()) + "\n")
		fout.close()
		print str(self.clusterNum) + " -> " + str(self.numMentions) + " mentions"


		if printStats:
			# constructs FILE
			fout = open(self.outputDir + 'clusterMentions/' + str(self.clusterNum) + ".txt", 'w')

			# collects stats: how many Mentions does each Reference contain?
			# i.e., 1 mention per Ref happens 13 times
			#       2 mentions per Ref happens 4 times
			#       3 mentions per Ref happens 5 times
			self.numMentionCounts = defaultdict(int)
			for r in self.references.keys():
				count = len(self.references[r])
				self.numMentionCounts[count] = self.numMentionCounts[count] + 1

			sorted_x = sorted(self.numMentionCounts.items(), key=operator.itemgetter(0))
			fout.write("# Mentions per Ref, # times this occurred\n")
			fout.write("------------------------------\n")
			for i in sorted_x:
				fout.write(str(i[0]) + "," + str(i[1]) + "\n")
			fout.write("\n")

			fout.write("-------------------------- REFS NOT CONTAINED IN HEAD DOC --------------------------------\n")
			for r in self.references.keys():
				#if r not in self.docs[self.headDoc].refs:
				fout.write("\t " + r + " (" + str(len(self.references[r])) + " mentions):\n")
				for m in self.references[r]:
					fout.write("\t\t" + m.text + " (doc " + str(m.doc_id) + "; m_id: " + str(m.m_id) + "; lemma: " + m.lemma + ")\n")

			fout.close()
	def createSemanticSpaceSimVectors(self, outPickle, outFile, N, W, sliceNum, totalSlices):
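		"""
		Builds a sliding-window co-occurrence vector for every Mention token type across
		all valid clusters, pickles the vectors to outputDir/outPickle, and writes the
		pairwise cosine similarities to outputDir/outFile.  N is the number of most
		frequent context words, W is the window radius, and sliceNum/totalSlices split
		the Mention types so the work can run in parallel slices.
		(Summary inferred from the code below.)
		"""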
		print "* creating semantic space vectors"
		fullWindowSize = W*2 + 1

		outPickleFile = self.outputDir + outPickle
		outputFile = self.outputDir + outFile
		mentionTypes = [] # token types that will get a vector (in practice: every non-stopword token of length > 1 from sentences > 0, not only tokens inside Mentions)

		# gets the N most popular words (non-stopwords and > 1 in length)
		print "* gathering most popular " + str(N) + " words"
		sys.stdout.flush()

		wordCounts = defaultdict(int)
		docs = []
		for clusterNum in self.validClusters:

			# iterates through each file in the given dir/cluster
			for f in glob(self.dataDir + str(clusterNum) + '/*plus.xml'):

				doc_id = f[f.rfind("/") + 1:]
				doc = Document(doc_id)

				tokenIDs = defaultdict(str)
				# gets the contents of the file
				with open(f, "r") as myfile:
					fileContents = myfile.read().replace('\n', ' ')

				# reads <tokens>
				it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
				for match in it:
					t_id = int(match.group(1))
					sent_num = int(match.group(2))
					token = match.group(3).lower()
					tokenIDs[t_id] = token
					if sent_num > 0 and token not in self.stopwords and len(token) > 1:
						wordCounts[token] = wordCounts[token] + 1
						if token not in mentionTypes:
							mentionTypes.append(token)


				# reads <Markables>
				regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
				markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
				it = tuple(re.finditer(regex, markables))
				for match in it:
					isPred = False
					if "ACTION" in match.group(1):
						isPred = True
					m_id = int(match.group(2))

					# gets the token IDs
					regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
					it2 = tuple(re.finditer(regex2, match.group(3)))
					curTokenIDs = []
					text = ""
					for match2 in it2:
						tokenID = int(match2.group(1))
						curTokenIDs.append(tokenID)
						text = text + str(tokenIDs[tokenID]) + " "
					text = text.rstrip()

					# constructs the Mention
					mention = Mention(m_id, text, curTokenIDs, isPred, doc_id)

					# adds to the Doc and stores it (we will update the Doc w/ ref info below)
					doc.addMention(mention)

				docs.append(doc)
		
		print "* there were " + str(len(docs)) + " unique docs"
		sys.stdout.flush()

		# puts the top N word types into 'commonTypes'
		sorted_wordCounts = sorted(wordCounts.items(), key=operator.itemgetter(1), reverse=True)
		commonTypes = [x[0] for x in sorted_wordCounts][0:N]

		print "# unique mention tokens: " + str(len(mentionTypes))

		# goes through all docs again, this time doing the sliding window
		# in order to calculate the PMI-style scores, where
		# PMI1 = freq(p,c) / (freq(p) * freq(c))
		# PMI2 = log(prob(p,c) / (prob(p) * prob(c)))
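		# (only the PMI1-style ratio is actually used when the vectors are built below)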
		mentionCounts = defaultdict(int)
		commonWordsCounts = defaultdict(int)
		mentionAndCommonCounts = defaultdict(int)
		print "* calculating PMI counts for all Mentions across all clusters of docs"
		for clusterNum in self.validClusters:
			# iterates through each file in the given dir/cluster
			for f in glob(self.dataDir + str(clusterNum) + '/*plus.xml'):

				#docTokens = []
				mentionLocations = defaultdict(list)
				commonWordsLocations = defaultdict(list)

				# gets the contents of the file
				with open(f, "r") as myfile:
					fileContents = myfile.read().replace('\n', ' ')

				# reads <tokens>
				it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
				i = 1
				for match in it:
					sent_num = int(match.group(2))
					token = match.group(3).lower()
					if sent_num > 0 and token not in self.stopwords and len(token) > 1:
						#docTokens.append(token)

						if token in commonTypes:
							commonWordsLocations[token].append(i)
							commonWordsCounts[token] = commonWordsCounts[token] + 1
						if token in mentionTypes:
							mentionLocations[token].append(i)
							mentionCounts[token] = mentionCounts[token] + 1

						i = i + 1
				#print "Mentions: " + str(mentionLocations)
				#print "commons: " + str(commonWordsLocations)

				# looks at every Mention token to see if a common word appeared within W positions
				# on either side of it (positions are counted over the filtered token stream:
				# sentence > 0, non-stopword, length > 1)
				for m in mentionLocations.keys():
					for l in mentionLocations[m]:
						lower = l - W
						upper = l + W

						for c in commonWordsLocations.keys():
							
							if c != m:
								for l2 in commonWordsLocations[c]:
									if l2 >= lower and l2 <= upper:
										mentionAndCommonCounts[(m,c)] = mentionAndCommonCounts[(m,c)] + 1

		# removes singletons
		# sorted_m = sorted(mentionCounts.items(), key=operator.itemgetter(1), reverse=True)
		# single = 0
		# for m in sorted_m:
		# 	print str(m[0]) + " -> " + str(m[1])
		# 	if m[1] < 3:
		# 		single = single+1
		# 		mentionCounts.pop(m[0], "None")
		# print "singletons: " + str(single) + " out of " + str(len(sorted_m))
		
		# constructs the pmi vector for each Mention token
		print "* creating vector for " + str(len(mentionCounts.keys())) + " Mention types"
		sys.stdout.flush()

		vectors = {}
		sliceSize = 1 + int(math.floor(float(len(mentionCounts)) / float(totalSlices)))
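		# e.g., with 950 Mention types and totalSlices = 4: sliceSize = 1 + floor(950/4) = 238 (numbers illustrative)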
		print "slice size: " + str(sliceSize)


		lower = (sliceNum - 1) * sliceSize
		if sliceNum == totalSlices:
			upper = len(mentionCounts) - 1
		else:
			upper = sliceNum * sliceSize - 1
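		# NOTE: slicing by position in mentionCounts.keys() assumes the key order is identical in
		# every run/slice (which generally holds for CPython 2 on identical input, but is fragile);
		# each slice only builds, and later compares, its own subset of vectors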

		print "SLICENUM: " + str(sliceNum) + ": " + str(lower) + "," + str(upper)
		sys.stdout.flush()
		i = 0
		for m in mentionCounts.keys():

			if lower <= i <= upper:
				print "mention " + str(i) + " across " + str(len(commonWordsCounts.keys())) + " commonWordsCounts"
				sys.stdout.flush()
				vec = []

				for c in commonWordsCounts.keys():
					if (m, c) in mentionAndCommonCounts:
						vec.append(float(mentionAndCommonCounts[(m, c)]) / (float(commonWordsCounts[c]) * float(mentionCounts[m])))
					else:
						vec.append(0)
				vectors[m] = vec
			i = i + 1

		# SAVES VECTORS TO A PICKLE (SERIALIZED) FILE
		print "finished all Mention types' vectors; now writing them to disk"
		fileObj = open(outPickleFile, "wb")
		pickle.dump(vectors, fileObj)
		fileObj.close()

		print "finished!  now writing the stats file (the cosine sim. b/w every Mention type pairs"

		# WRITES STATS FILE
		ftrain = open(outputFile, 'w')
		# calculates the cosine sim. b/w every possible pair of vectors
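		# cosine(v1, v2) = dot(v1, v2) / (||v1|| * ||v2||); the small 0.0000001 terms below keep
		# the division defined when one of the vectors is all zeros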
		for v1 in vectors.keys():
			simScores = {}
			vec1 = vectors[v1]

			denom1 = 0
			for i in range(len(vec1)):
				denom1 = denom1 + math.pow(vec1[i], 2)
			denom1 = math.sqrt(denom1)

			for v2 in vectors.keys():
				if v1 != v2:
					vec2 = vectors[v2]
					num = 0
					for i in range(len(vec1)):
						num = num + (float(vec1[i]) * float(vec2[i]))
					
					denom2 = 0
					for i in range(len(vec2)):
						denom2 = denom2 + math.pow(vec2[i], 2)
					denom2 = math.sqrt(denom2)
					#print denom1
					#print denom2
					cosine = float(num + 0.0000001) / (0.0000001 + float(denom1) * float(denom2))
					simScores[v2] = cosine
			sorted_scores = sorted(simScores.items(), key=operator.itemgetter(1), reverse=True)
			ftrain.write("* " + str(v1) + ":\n")
			for x in sorted_scores:
				ftrain.write("\t" + str(x[0]) + " (" + str(x[1]) + ")\n")
		ftrain.close()