Example #1
	def __init__(self, *args, **kwargs):
		"""
		Initialization assumes that the SENTTREE_DBSTRING and SENTTREE_PATH
		environment variables are set.
		"""
		DocumentReader.__init__(self, *args, **kwargs)
		self.dbstring = os.environ["SENTTREE_DBSTRING"]
		self.postgres_recorder = PostgresDataRecorder(self.dbstring)
		self.folderPath = os.environ['SENTTREE_PATH']
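
A minimal setup sketch (hypothetical placeholder values, not from the source): the reader above also reads SENTTREE_DBSTRING, so both variables must be exported before the class is constructed.

import os

# Hypothetical values; point these at your own database and data directory.
os.environ["SENTTREE_DBSTRING"] = "dbname=senttree user=postgres host=localhost"
os.environ["SENTTREE_PATH"] = "/path/to/stanfordSentimentTreebank"

reader = SentimentTreeBank2WayReader()  # full class shown in Example #7
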
Example #2
	def __init__(self,*args, **kwargs):
		"""
		Initialization assumes that the IMDB_DBSTRING and IMDB_PATH
		environment variables are set. To set the path on Linux or macOS:
		export IMDB_PATH=/some_directory_containing_IMDB_data
		"""
		DocumentReader.__init__(self, *args, **kwargs)
		self.dbstring = os.environ["IMDB_DBSTRING"]
		self.postgres_recorder = PostgresDataRecorder(self.dbstring)
		self.folderPath = os.environ['IMDB_PATH']
Example #3
    def __init__(self, *args, **kwargs):
        """
		It reads the environment variables and initializes the
		base class.
		"""
        DocumentReader.__init__(self, *args, **kwargs)
        self.dbstring = os.environ["REUTERS_DBSTRING"]
        self.postgres_recorder = PostgresDataRecorder(self.dbstring)
        self.folderPath = os.environ['REUTERS_PATH']
        self.validationDict = {}
Example #4
    def __init__(self, *args, **kwargs):
        """
		Initialization assumes that the NEWSGROUP_DBSTRING and NEWSGROUP_PATH
		environment variables are set. To set the path on Linux or macOS:
		export NEWSGROUP_PATH=/some_directory_containing_newsgroup_data
		"""
        DocumentReader.__init__(self, *args, **kwargs)
        self.dbstring = os.environ["NEWSGROUP_DBSTRING"]
        self.postgres_recorder = PostgresDataRecorder(self.dbstring)
        self.folderPath = os.environ['NEWSGROUP_PATH']
        self.validationDict = {}
        self.topic_names = []
Example #5
	def __init__(self,*args, **kwargs):
		"""
		It reads the environment variables and initializes the
		base class.
		"""
		DocumentReader.__init__(self, *args, **kwargs)
		self.dbstring = os.environ["DUC_DBSTRING"]
		self.postgres_recorder = PostgresDataRecorder(self.dbstring)
		self.folderPath = os.environ['DUC_PATH']
		self.processed_filenames = []
		self.processed_summaries = []
		self.lambda_val = os.environ['DUC_LAMBDA']
		self.diversity = os.environ['DUC_DIVERSITY']
		self.duc_topic = os.environ['DUC_TOPIC']
		self.document_id = 0
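
A similar setup sketch for the DUC reader above (hypothetical placeholder values): besides the database string and data path, three summary-generation knobs are read from the environment.

import os

# Hypothetical values; adjust to your own setup.
os.environ["DUC_DBSTRING"] = "dbname=duc user=postgres host=localhost"
os.environ["DUC_PATH"] = "/path/to/DUC"
os.environ["DUC_LAMBDA"] = "0.5"     # lambda_val forwarded to generateSummary()
os.environ["DUC_DIVERSITY"] = "1"    # "1" turns diversity on (see runBaselines in Example #9)
os.environ["DUC_TOPIC"] = "2001"     # "2001" selects __readDUC2001, anything else __readDUC2002
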
Example #6
class IMDBReader(DocumentReader):
	""" 
	IMDB Document Reader. Reads IMDB documents extracted from 
	: 
	"""

	def __init__(self,*args, **kwargs):
		"""
		Initialization assumes that the IMDB_DBSTRING and IMDB_PATH
		environment variables are set. To set the path on Linux or macOS:
		export IMDB_PATH=/some_directory_containing_IMDB_data
		"""
		DocumentReader.__init__(self, *args, **kwargs)
		self.dbstring = os.environ["IMDB_DBSTRING"]
		self.postgres_recorder = PostgresDataRecorder(self.dbstring)
		self.folderPath = os.environ['IMDB_PATH']
	
	def readTopic(self):
		"""
		"""
		rootDir = "%s/train" %self.folderPath
		return self._getTopics(rootDir)
	
	def readDocument(self, ld): 
		"""
		"""
		if ld <= 0: return 0 	
		self.postgres_recorder.trucateTables()
		self.postgres_recorder.alterSequences()
		topic_names = self.readTopic()
		
		
		document_id = 0
		for first_level_folder in next(os.walk(self.folderPath))[1]:
			if not(DocumentReader._folderISHidden(self, first_level_folder)):
				for topic in topic_names:					
					if first_level_folder == 'test' and topic == 'unsup':
						continue
					for file_ in os.listdir("%s%s%s%s%s" %(self.folderPath, "/", \
											first_level_folder, "/", topic)):
						doc_content = self._getTextFromFile("%s%s%s%s%s%s%s" \
							%(self.folderPath, "/", first_level_folder, "/", topic, "/", file_))
						
						document_id += 1
						title, metadata, istrain = None, None, None					
						try:
							trainortest = first_level_folder
							metadata = "SPLIT:%s"%trainortest
							istrain = 'YES' if trainortest.lower() == 'train' else 'NO'			
						except:
							Logger.logr.info("NO MetaData or Train Test Tag")
						self.postgres_recorder.insertIntoDocTable(document_id, title, \
									doc_content, file_, metadata) 
						category = topic.split('.')[0]
						self.postgres_recorder.insertIntoDocTopTable(document_id, \
									[topic], [category]) 		
						self._recordParagraphAndSentence(document_id, doc_content, self.postgres_recorder, topic, istrain)
					
					
		Logger.logr.info("Document reading complete.")
		return 1
	
	
	def runBaselines(self):
		"""
		"""
		latent_space_size = 300
		Logger.logr.info("Starting Running Para2vec (Doc) Baseline")
		# paraBaseline = Paragraph2VecSentenceRunner(self.dbstring)
		# paraBaseline.prepareData()
		# paraBaseline.runTheBaseline(latent_space_size)

		# Logger.logr.info("Starting Running Node2vec Baseline")
		# n2vBaseline = Node2VecRunner(self.dbstring)
		# n2vBaseline.prepareData()

		# paraBaseline.runEvaluationTask()
		# paraBaseline.runClassificationTask()
		
		#n2vBaseline.runTheBaseline(latent_space_size)

		#Logger.logr.info("Starting Running Iterative Update Method")
		#iterUdateBaseline = IterativeUpdateRetrofitRunner(self.dbstring)
		#iterUdateBaseline.prepareData()
		#iterUdateBaseline.runTheBaseline()
		
		#docBaseLine = Paragraph2VecRunner(self.dbstring)
		#docBaseLine.prepareData()
		#docBaseLine.runTheBaseline(latent_space_size)
		#docBaseLine.runEvaluationTask()
		#docBaseLine.runClassificationTask()

		docBaseLineCEXE = Paragraph2VecCEXERunner(self.dbstring)
		docBaseLineCEXE.prepareData()
		docBaseLineCEXE.runTheBaseline(latent_space_size)
		docBaseLineCEXE.runEvaluationTask()
		docBaseLineCEXE.runClassificationTask()
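
A hypothetical driver sketch for IMDBReader above; it assumes IMDB_DBSTRING and IMDB_PATH are already exported and that the train/test topic folders sit directly under IMDB_PATH.

reader = IMDBReader()
if reader.readDocument(1):  # ld <= 0 is a no-op; a positive value truncates the tables and reloads
	reader.runBaselines()   # runs the Paragraph2VecCEXERunner pipeline shown above
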
Example #7
class SentimentTreeBank2WayReader(DocumentReader):
	def __init__(self, *args, **kwargs):
		"""
		Initialization assumes that the SENTTREE_DBSTRING and SENTTREE_PATH
		environment variables are set.
		"""
		DocumentReader.__init__(self, *args, **kwargs)
		self.dbstring = os.environ["SENTTREE_DBSTRING"]
		self.postgres_recorder = PostgresDataRecorder(self.dbstring)
		self.folderPath = os.environ['SENTTREE_PATH']

	def readTopic(self):
		topic_names = ['pos', 'neg','unsup']
		categories = ['pos', 'neg', 'unsup']

		self.postgres_recorder.insertIntoTopTable(topic_names, categories)				
		Logger.logr.info("[%i] Topic reading complete." %(len(topic_names)))
		return topic_names

	def readDSplit(self,fileName):
		"""
		1 Train, 2 Test, 3 dev
		"""
		line_count = 0 
		dSPlitDict = {}
		for line in open(fileName, encoding='utf-8', errors='ignore'):
			if line_count == 0: 
				pass
			else:	
				doc_id,_, splitid = line.strip().partition(",")
				dSPlitDict[int(doc_id)] = int(splitid)
			line_count = line_count + 1

		Logger.logr.info("Finished reading %i sentences and their splits"%line_count)

		return dSPlitDict

	def readSentences(self,fileName):
		line_count = 0
		sentenceDict = {}
		for line in open(fileName, encoding='utf-8', errors='ignore'):
			if line_count == 0:
				pass
			else:		
				doc_id,_,sentence = line.strip().partition("\t")
				sentenceDict[int(doc_id)] = sentence.strip()
			line_count = line_count + 1
		Logger.logr.info("Finished reading %i sentences" % line_count)
		return sentenceDict

	def phraseToSentiment(self, fileName):
		line_count = 0 
		phraseToSentimentDict = {}

		for line in open(fileName, encoding='utf-8', errors='ignore'):
			if line_count == 0:
				pass
			else:
				phrase_id,_, sentiment = line.strip().partition("|")
				phraseToSentimentDict[int(phrase_id)] = float(sentiment)
			line_count = line_count + 1
		Logger.logr.info("Finished reading %i phrases" % line_count)
		return phraseToSentimentDict

	def getTopicCategory(self, sentiment_val):
		"""
		[0, 0.2] very negative 
		(0.2, 0.4] negative 
		(0.4, 0.6] neutral 
		(0.6, 0.8] positive 
		(0.8, 1.0] very positive
		"""
		if sentiment_val <=0.4: 
			return ('neg', 'neg')
		elif sentiment_val >0.6:
			return ('pos', 'pos')
		else:
			return ('unsup', 'unsup')
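	# A quick sanity check of the thresholds above (hypothetical calls):
	#   reader.getTopicCategory(0.15) -> ('neg', 'neg')
	#   reader.getTopicCategory(0.50) -> ('unsup', 'unsup')   # neutral band (0.4, 0.6]
	#   reader.getTopicCategory(0.95) -> ('pos', 'pos')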

	def readDocument(self, ld): 
		"""
		Skip neutral phrases.
		"""

		if ld <= 0: return 0 			
		self.postgres_recorder.trucateTables()
		self.postgres_recorder.alterSequences()
		topic_names = self.readTopic()

		allPhrasesFile = "%s/dictionary.txt"%(self.folderPath)
		dSPlitDict = self.readDSplit("%s/datasetSplit.txt"%self.folderPath)
		sentenceDict = self.readSentences("%s/datasetSentences.txt"%self.folderPath)
		phraseToSentimentDict = self.phraseToSentiment("%s/sentiment_labels.txt"%self.folderPath)

		for line in open(allPhrasesFile, encoding='utf-8', errors='ignore'):
				phrase, _ , phrase_id = line.strip().partition("|")
				contains_in_train, contains_in_test, contains_in_dev, is_a_sentence = False, False, False, False
				sentiment_val = phraseToSentimentDict[int(phrase_id)]			
				topic, category = self.getTopicCategory(sentiment_val)
				for sent_id, sentence in sentenceDict.items():
					if phrase in sentence: 
						train_label = dSPlitDict[sent_id]
						if train_label ==1:
							contains_in_train = True
						elif train_label==2:
							contains_in_test = True
						elif train_label==3:
							contains_in_dev = True 

					if phrase==sentence:
						is_a_sentence = True 
					
				#  all neutrals are considered as part of training   
				if sentiment_val >0.4 and sentiment_val<=0.6:
					metadata = "SPLIT:%s"%('unsup')
					istrain='MAYBE'
				elif contains_in_test==True and contains_in_train==False and\
					contains_in_dev==False and is_a_sentence==True:
					metadata = "SPLIT:%s"%('test')
					istrain ="NO"				
				elif contains_in_train ==True and contains_in_test==False and\
					contains_in_dev == False:
					metadata = "SPLIT:%s"%('train')
					istrain='YES'
				else:
					metadata = "SPLIT:%s"%('unsup')
					istrain='MAYBE'
					topic, category ='unsup', 'unsup'

				self.postgres_recorder.insertIntoDocTable(phrase_id, "", \
									phrase, "", metadata) 
				self.postgres_recorder.insertIntoDocTopTable(phrase_id, \
									[topic], [category])
				self._recordParagraphAndSentence(phrase_id, phrase,\
					self.postgres_recorder, topic, istrain)
	
		Logger.logr.info("Document reading complete.")
		return 1

	def runBaselines(self):
		"""
		"""
		latent_space_size = 300
		Logger.logr.info("Starting Running Para2vec (Doc) Baseline")
		# paraBaseline = Paragraph2VecSentenceRunner(self.dbstring)
		# paraBaseline.prepareData()
		# paraBaseline.runTheBaseline(latent_space_size)

		# Logger.logr.info("Starting Running Node2vec Baseline")
		# n2vBaseline = Node2VecRunner(self.dbstring)
		# n2vBaseline.prepareData()

		# paraBaseline.runEvaluationTask()
		# paraBaseline.runClassificationTask()
		
		#n2vBaseline.runTheBaseline(latent_space_size)

		#Logger.logr.info("Starting Running Iterative Update Method")
		#iterUdateBaseline = IterativeUpdateRetrofitRunner(self.dbstring)
		#iterUdateBaseline.prepareData()
		#iterUdateBaseline.runTheBaseline()
		
		#docBaseLine = Paragraph2VecRunner(self.dbstring)
		#docBaseLine.prepareData()
		#docBaseLine.runTheBaseline(latent_space_size)
		#docBaseLine.runEvaluationTask()
		#docBaseLine.runClassificationTask()

		docBaseLineCEXE = Paragraph2VecCEXERunner(self.dbstring)
		docBaseLineCEXE.prepareData()
		docBaseLineCEXE.runTheBaseline(latent_space_size)
		docBaseLineCEXE.runEvaluationTask()
		docBaseLineCEXE.runClassificationTask()
Example #8
class NewsGroupReader(DocumentReader):
    """ 
	News Group Document Reader.
	"""
    def __init__(self, *args, **kwargs):
        """
		Initialization assumes that the NEWSGROUP_DBSTRING and NEWSGROUP_PATH
		environment variables are set. To set the path on Linux or macOS:
		export NEWSGROUP_PATH=/some_directory_containing_newsgroup_data
		"""
        DocumentReader.__init__(self, *args, **kwargs)
        self.dbstring = os.environ["NEWSGROUP_DBSTRING"]
        self.postgres_recorder = PostgresDataRecorder(self.dbstring)
        self.folderPath = os.environ['NEWSGROUP_PATH']
        self.validationDict = {}
        self.topic_names = []

    def __stripNewsgroupHeader(self, text):
        """
	    Given text in "news" format, strip the headers, by removing everything
	    before the first blank line.
	    """
        _before, _blankline, after = text.partition('\n\n')
        return after
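    # e.g. __stripNewsgroupHeader("From: a@b\nSubject: x\n\nbody text") -> "body text"
    # (hypothetical call illustrating the partition on the first blank line)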

    def __stripNewsgroupQuoting(self, text):
        """
	    Given text in "news" format, strip lines beginning with the quote
	    characters > or |, plus lines that often introduce a quoted section
	    (for example, because they contain the string 'writes:'.)
	    """
        _QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'
                               r'|^In article|^Quoted from|^\||^>)')

        good_lines = [
            line for line in text.split('\n') if not _QUOTE_RE.search(line)
        ]
        return '\n'.join(good_lines)

    def __stripNewsgroupFooter(self, text):
        """
	    Given text in "news" format, attempt to remove a signature block.
	    As a rough heuristic, we assume that signatures are set apart by either
	    a blank line or a line made of hyphens, and that it is the last such line
	    in the file (disregarding blank lines at the end).
	    """
        lines = text.strip().split('\n')
        for line_num in range(len(lines) - 1, -1, -1):
            line = lines[line_num]
            if line.strip().strip('-') == '':
                break

        if line_num > 0:
            return '\n'.join(lines[:line_num])
        else:
            return text

    def readTopic(self):
        """
		http://pythoncentral.io/how-to-traverse-a-directory-tree-in-python-guide-to-os-walk/
		"""

        rootDir = "%s/20news-bydate-train" % self.folderPath
        return self._getTopics(rootDir)

    def stripDocContent(self, doc_content):
        doc_content = self.__stripNewsgroupHeader(doc_content)
        doc_content = self.__stripNewsgroupFooter(doc_content)
        return self.__stripNewsgroupQuoting(doc_content)

    def __createValidationSet(self, document_ids):

        total_doc = len(document_ids)
        nvalid_doc = int(total_doc * 0.20)

        np.random.seed(2000)
        valid_list = np.random.choice(document_ids, nvalid_doc,
                                      replace=False).tolist()

        for id_ in valid_list:
            self.validationDict[id_] = 1

    def __readAPass(self, load=0):
        if load == 0:
            self.topic_names = self.readTopic()

        train_doc_ids = []
        document_id = 0
        for first_level_folder in os.listdir(self.folderPath):
            if not (DocumentReader._folderISHidden(self, first_level_folder)):
                for topic in self.topic_names:
                    if topic not in [
                            'talk.politics.mideast', 'comp.graphics',
                            'soc.religion.christian', 'rec.autos', 'sci.space',
                            'talk.politics.guns', 'rec.sport.baseball',
                            'sci.med'
                    ]:
                        continue
                    for file_ in os.listdir("%s%s%s%s%s" %(self.folderPath, "/", \
                          first_level_folder, "/", topic)):
                        doc_content = self._getTextFromFile("%s%s%s%s%s%s%s" \
                         %(self.folderPath, "/", first_level_folder, "/", topic, "/", file_))

                        doc_content = self.stripDocContent(doc_content)

                        document_id += 1
                        title, metadata, istrain = None, None, None
                        try:
                            trainortest = first_level_folder.split('-')[-1]
                            metadata = "SPLIT:%s" % trainortest
                            istrain = 'YES' if (trainortest.lower()
                                                == 'train') else 'NO'
                        except:
                            Logger.logr.info("NO MetaData or Train Test Tag")

                        if istrain == 'YES':
                            train_doc_ids.append(document_id)

                        if document_id in self.validationDict:
                            istrain = 'VALID'

                        if load == 1:
                            self.postgres_recorder.insertIntoDocTable(document_id, title, \
                               doc_content, file_, metadata)
                            category = topic.split('.')[0]
                            self.postgres_recorder.insertIntoDocTopTable(document_id, \
                               [topic], [category])
                            self._recordParagraphAndSentence(
                                document_id, doc_content,
                                self.postgres_recorder, topic, istrain)

        Logger.logr.info("A pass of the document reading complete.")
        return train_doc_ids

    def readDocument(self, ld):
        """
		Stripping is by default inactive. For future reference it has been 
		imported from scikit-learn newsgroup reader package. 

		
		"""
        if ld <= 0: return 0
        self.postgres_recorder.trucateTables()
        self.postgres_recorder.alterSequences()

        train_doc_ids = self.__readAPass(0)
        self.__createValidationSet(train_doc_ids)
        self.__readAPass(1)
        return 1

    def runBaselines(self, pd, rbase, gs):
        """
		"""
        #optDict = self._runClassificationOnValidation(pd, rbase, gs,"news")
        #self.doTesting(optDict, "news", rbase, pd, gs, True)
        optDict = self._runClusteringOnValidation(pd, rbase, gs, "news")
        self.doTesting(optDict, "news", rbase, pd, gs, False)
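
A hypothetical driver sketch for NewsGroupReader above; pd, rbase and gs are assumed to be the prepare-data / run-baseline / generate-summary flags used by the runners elsewhere in this project.

reader = NewsGroupReader()
reader.readDocument(1)      # pass 0 collects train ids, pass 1 loads documents into Postgres
reader.runBaselines(pd=1, rbase=1, gs=1)
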
Example #9
class DUCReader(DocumentReader):
	""" 
	DUC Document Reader

	"""

	def __init__(self,*args, **kwargs):
		"""
		It reads the environment variables and initializes the
		base class.
		"""
		DocumentReader.__init__(self, *args, **kwargs)
		self.dbstring = os.environ["DUC_DBSTRING"]
		self.postgres_recorder = PostgresDataRecorder(self.dbstring)
		self.folderPath = os.environ['DUC_PATH']
		self.processed_filenames = []
		self.processed_summaries = []
		self.lambda_val = os.environ['DUC_LAMBDA']
		self.diversity = os.environ['DUC_DIVERSITY']
		self.duc_topic = os.environ['DUC_TOPIC']
		self.document_id = 0

	def readTopic(self):
		"""
		Recording DUC years as topics.
		"""
		topic_names = ['2001', '2002', '2003', '2004', '2005', '2006', '2007']
		categories = topic_names
		self.postgres_recorder.insertIntoTopTable(topic_names, categories)
		Logger.logr.info("Topic reading complete.")
	
	
	def recordDocuments(self, documents, topic, summaryFileDict):
		docFileDict = {}

		for document in documents:
			filename = document.split(os.path.sep)[-1] #ft923-5089
			if filename in self.processed_filenames: #don't store duplicate files
				continue
			if filename not in summaryFileDict:
				continue

			doc_content = self._getTextFromFile("%s" %(document))
			soup = BeautifulSoup(doc_content, "html.parser")
		
			try:
				doc_content = soup.find('text').text.strip()
			except:
				Logger.logr.info("%s %s" %(document, "Skipping. Cause, TEXT tag not found"))
				continue
			if doc_content.count('.') > 1000 or doc_content.count('.') < 1:
				Logger.logr.info("%s %s" %(document, "Skipping. Cause, %s sentences." %doc_content.count('.')))
				continue

			if len(doc_content.split()) < 100:
				continue


			self.processed_filenames += [filename]
			docFileDict [filename] = 1
			self.document_id += 1
			title, metadata, istrain = None, None, 'YES'


			self.postgres_recorder.insertIntoDocTable(self.document_id, title, \
						doc_content, filename, metadata) 
			category = topic.split('.')[0]
			self.postgres_recorder.insertIntoDocTopTable(self.document_id, \
						[topic], [category])
			self._recordParagraphAndSentence(self.document_id, doc_content, self.postgres_recorder, topic, istrain)
			
		return docFileDict
		
	def __recordSummariesA(self, summaries, document_dict):
		"""
		First check whether corresponding valid document is in 
		the database
		"""
		for summary in summaries:
			doc_content = self._getTextFromFile("%s" %(summary))
			soup = BeautifulSoup(doc_content, "html.parser")
			sums = soup.findAll('sum')

			for sum_ in sums:
				filename = sum_.get('docref')
				doc_content = sum_.text.strip()
				if filename not in document_dict:
					Logger.logr.info("Checking %s in document dict"%filename)
					continue

				method_id = 20 #DUC = 20
				summarizer = sum_.get('summarizer')
				metadata = "SUMMARIZER:%s" %(summarizer)
				if "%s%s" %(filename, summarizer) in self.processed_summaries:
					continue
				self.processed_summaries += ["%s%s" %(filename, summarizer)]
				self.postgres_recorder.insertIntoGoldSumTable(filename, doc_content, \
							method_id, metadata)

	def __getSummaryFileNames(self, summaryFile):
		doc_content = self._getTextFromFile("%s" %(summaryFile))
		soup = BeautifulSoup(doc_content, "html.parser")
		summaries = soup.findAll('sum')
		filenames = []

		for summary in summaries:
			filename = summary.get('docref')
			doc_content = summary.text
			if len(doc_content.split()) <100:
				continue
			else:
				filenames.append(filename)

		return filenames

	def __getValidSummaryFiles(self, summaries, summaryFileDict):
		
		for summary in summaries: 
			fileNames = self.__getSummaryFileNames(summary)
			for names in fileNames:
				summaryFileDict[names] = 1
		return summaryFileDict

	def __readDUC2001(self):
		"""
		Loads the DUC 2001 documents into the database. A summary with
		fewer than 100 words is discarded; as a rough heuristic, the text
		is split on whitespace and the words are counted. The function
		also makes sure that no document is left without a summary.
		"""
		topic = "2001"
		cur_path = "%s/%s" %(self.folderPath, "DUC2001")

		# Go one pass to collect all valid summary file names
		summaries, documents =[], [] 
		for root, directories, files in os.walk(cur_path):
			documents += [os.path.join(root, file_) \
				for file_ in files if file_ not in  ['50', '100', '200', '400', 'perdocs']]
			summaries += [os.path.join(root, file_)\
				for file_ in files if file_ in "perdocs"]

		summaryFileDict = {}
		summaryFileDict = self.__getValidSummaryFiles(summaries, summaryFileDict)
		Logger.logr.info("Got %i documents and %i summaries"%(len(documents), len(summaryFileDict)))
		
		Logger.logr.info("Recording DUC 2001 Documents.")
		docFileDict = self.recordDocuments(documents, topic, summaryFileDict)
		Logger.logr.info("%i elements in summary dict and %i"\
		 " elements in doc dict"%(len(summaryFileDict), len(docFileDict)))
		Logger.logr.info("Recording DUC 2001 Summaries.")
		self.__recordSummariesA(summaries, docFileDict)
		
		
	def __readDUC2002(self):
		"""
		Loads the DUC 2002 documents into the database. A summary with
		fewer than 100 words is discarded; as a rough heuristic, the text
		is split on whitespace and the words are counted. The function
		also makes sure that no document is left without a summary.
		"""
		topic = "2002"
		cur_path = "%s/%s" %(self.folderPath, "DUC2002")

		# Go one pass to collect all valid summary file names
		summaries, documents =[], [] 
		for root, directories, files in os.walk(cur_path):
			documents += [os.path.join(root, file_) \
				for file_ in files if file_ not in  ['10', '50', '100', '200', '400', '200e', '400e', 'perdocs']]
			summaries += [os.path.join(root, file_)\
				for file_ in files if file_ in "perdocs"]
 
		summaryFileDict = {}
		summaryFileDict = self.__getValidSummaryFiles(summaries, summaryFileDict)
		Logger.logr.info("Got %i documents and %i summaries"%(len(documents), len(summaryFileDict)))
		
		Logger.logr.info("Recording DUC 2002 Documents.")
		docFileDict = self.recordDocuments(documents, topic, summaryFileDict)
		Logger.logr.info("%i elements in summary dict and %i"\
		 " elements in doc dict"%(len(summaryFileDict), len(docFileDict)))
		Logger.logr.info("Recording DUC 2002 Summaries.")
		self.__recordSummariesA(summaries, docFileDict)
		
		
	def readDocument(self, ld): 
		if ld <= 0: return 0 
		self.postgres_recorder.trucateTables()
		self.postgres_recorder.truncateSummaryTable()
		self.postgres_recorder.alterSequences()
		self.readTopic()
		
		document_id = 0
		if self.duc_topic == str(2001):
			self.__readDUC2001()
		else:
			self.__readDUC2002()
		# document_id = self._readDUC2003(document_id)
		# document_id = self._readDUC2004(document_id)
		# document_id = self._readDUC2005(document_id)
		# document_id = self._readDUC2006(document_id)
		# document_id = self._readDUC2007(document_id)


	def __runSpecificEvaluation(self, models = [20], systems = []):
		rougeInstance = Rouge()
		rPDict = rougeInstance.buildRougeParamDict()
		rPDict['-l'] = str(100)
		rPDict['-c'] = str(0.99)

		evaluation = RankingEvaluation(topics = [self.duc_topic], models = models, systems = systems)
		evaluation._prepareFiles()
		evaluation._getRankingEvaluation(rPDict, rougeInstance)

		rPDict['-l'] = str(10)
		evaluation._getRankingEvaluation(rPDict, rougeInstance)
	
	
	def __runCombinedEvaluation(self):
		rougeInstance = Rouge()
		rPDict = rougeInstance.buildRougeParamDict()
		rPDict['-l'] = str(100)
		rPDict['-c'] = str(0.99)

		evaluation = RankingEvaluation(topics = [self.duc_topic], models = [20], systems = [1,2,3,4,5,6,7,9,10,11,12,21])
		evaluation._prepareFiles()
		evaluation._getRankingEvaluation(rPDict, rougeInstance)

		rPDict['-l'] = str(10)
		evaluation._getRankingEvaluation(rPDict, rougeInstance)
		
		
	def __getRecall(self, method_id, models, systems):
		output_file_name = ""
		for model in models:
			output_file_name += str(model)+"_"
		for system in systems:
			output_file_name += "_"+str(system)
		output_file_name += "_output"
		output_file_name += "_%s.txt" %(str(10))
		
		with open('%s%s%s' %(os.environ["SUMMARYFOLDER"],"/",output_file_name), 'r') as f:
			content = f.read()
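			# The ROUGE output is expected to contain a line like
			# "<method_id> ROUGE-1 Average_R: <recall> ...", which the split below picks apart.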
			recall = float(content.split("%s ROUGE-1 Average_R: " %method_id)[1].split(' ')[0])
		return recall
	
	
	def runBaselines(self, pd, rbase, gs):
		"""
		"""

############# Validation ############################		
		with open('%s%s%s%s' %(os.environ["TRTESTFOLDER"],"/",self.duc_topic,"_hyperparameters.txt"), 'w') as f:
			
			latent_space_size = 300

			diversity = False
			if self.diversity == str(1):
				diversity = True 

			# createValidationSet() Need to implement this function
			os.environ['DUC_EVAL']='VALID'
	
			recalls = {}
			window_opt = None #var for the optimal window
			for window in ["8", "10", "12"]:
			#for window in ["8"]:
				Logger.logr.info("Starting Running Para2vec Baseline for Window = %s" %window)
				self.postgres_recorder.truncateSummaryTable()
				paraBaseline = P2VSENTCExecutableRunner(self.dbstring)
				if 	window=="8":  
					paraBaseline.prepareData(pd)
				paraBaseline.runTheBaseline(rbase,latent_space_size, window)
				paraBaseline.generateSummary(gs,\
					lambda_val=self.lambda_val, diversity=diversity)
				paraBaseline.doHouseKeeping()			
				self.__runSpecificEvaluation(models = [20], systems = [2]) #Running Rouge for method_id = 2 only
				recalls[window] = self.__getRecall(method_id=2, models = [20], systems = [2])
				Logger.logr.info("Recall for %s = %s" %(window, recalls[window]))
			window_opt = max(recalls, key=recalls.get) #get the window for the max recall
			f.write("Optimal window size is %s%s"%(window_opt, os.linesep))
			f.write("P2V Window Recalls: %s%s" %(recalls, os.linesep))
			f.flush()

			Logger.logr.info("Starting Running Para2vec Baseline for Optimal Window = %s" %window_opt)
			self.postgres_recorder.truncateSummaryTable()
			paraBaseline = P2VSENTCExecutableRunner(self.dbstring)
			paraBaseline.runTheBaseline(rbase,latent_space_size, window_opt) #we need the p2v vectors created with optimal window
			paraBaseline.doHouseKeeping()
	
			recalls = {}
			beta_opt = None #var for the optimal beta
			for beta in ["0.3", "0.6", "0.9","1.0"]:
			#for beta in ["0.3"]:
				Logger.logr.info("Starting Running Node2vec Baseline for Beta = %s" %beta)
				self.postgres_recorder.truncateSummaryTable()
				n2vBaseline = Node2VecRunner(self.dbstring)
				n2vBaseline.mybeta = beta #reinitializing mybeta
				generate_walk = False
				if beta=="0.3":
				   n2vBaseline.prepareData(pd)
				   generate_walk = True 
				n2vBaseline.runTheBaseline(rbase, latent_space_size, generate_walk)
				n2vBaseline.generateSummary(gs, 5, "_retrofit",\
					 lambda_val=self.lambda_val, diversity=diversity)
				n2vBaseline.doHouseKeeping()
				self.__runSpecificEvaluation(models = [20], systems = [5]) #Running Rouge for method_id = 5 only
				recalls[beta] = self.__getRecall(method_id=5, models = [20], systems = [5])
				Logger.logr.info("Recall for %s = %s" %(beta, recalls[beta]))
			beta_opt = max(recalls, key=recalls.get) #get the beta for the max recall
			f.write("Optimal Beta is %s%s"%(beta_opt, os.linesep))
			f.write("N2V Beta Recalls: %s%s" %(recalls, os.linesep))
			f.flush()
	
			recalls = {}
			alpha_opt = None #var for the optimal alpha
			for alpha in [0.3, 0.6, 0.8, 1.0]:
			#for alpha in [0.3]:
				Logger.logr.info("Starting Running Iterative Baseline for Alpha = %s" %alpha)
				self.postgres_recorder.truncateSummaryTable()
				iterrunner = IterativeUpdateRetrofitRunner(self.dbstring)
				iterrunner.myalpha = alpha #reinitializing myalpha
				if alpha==0.3:
					iterrunner.prepareData(pd)
				iterrunner.runTheBaseline(rbase)
				iterrunner.generateSummary(gs, 7, "_weighted",\
					lambda_val=self.lambda_val, diversity=diversity)
				iterrunner.doHouseKeeping()
				self.__runSpecificEvaluation(models = [20], systems = [7])
				recalls[alpha] = self.__getRecall(method_id=7, models = [20], systems = [7])
				Logger.logr.info("Recall for %s = %s" %(alpha, recalls[alpha]))
			alpha_opt = max(recalls, key=recalls.get) #get the alpha for the max recall
			Logger.logr.info("Optimal Alpha=%s" %alpha_opt)
			f.write("Optimal alpha is %.2f%s"%(alpha_opt, os.linesep))
			f.write("ITR Alpha Recalls: %s%s" %(recalls, os.linesep))
			f.flush()

			w_recalls = {}
			unw_recalls = {}
			w_opt = None
			unw_opt = None
			for beta in [0.3, 0.6, 0.8, 1.0]:
			#for beta in [0.3]:
				Logger.logr.info("Starting Running Regularized Baseline for Beta = %s" %beta)
				self.postgres_recorder.truncateSummaryTable()
				regs2v = RegularizedSen2VecRunner(self.dbstring)
				regs2v.regBetaW = beta
				regs2v.regBetaUNW = beta
				if beta==0.3:
					regs2v.prepareData(pd)
				regs2v.runTheBaseline(rbase, latent_space_size)
				regs2v.generateSummary(gs,9,"_neighbor_w",\
					 lambda_val=self.lambda_val, diversity=diversity)
				regs2v.generateSummary(gs,10,"_neighbor_unw",\
					 lambda_val=self.lambda_val, diversity=diversity)
				regs2v.doHouseKeeping()
				self.__runSpecificEvaluation(models = [20], systems = [9, 10])
				w_recalls[beta] = self.__getRecall(method_id=9, models = [20], systems = [9, 10])
				unw_recalls[beta] = self.__getRecall(method_id=10, models = [20], systems = [9, 10])
				Logger.logr.info("W_Recall for %s = %s" %(beta, w_recalls[beta]))
				Logger.logr.info("UNW_Recall for %s = %s" %(beta, unw_recalls[beta]))
			w_opt_reg = max(w_recalls, key=w_recalls.get)
			unw_opt_reg = max(unw_recalls, key=unw_recalls.get)
			Logger.logr.info("Optimal regBetaW=%s and regBetaUNW=%s" %(w_opt_reg, unw_opt_reg))

			f.write("Optimal REG BetaW : %.2f%s" %(w_opt_reg, os.linesep))
			f.write("Optimal REG BetaUNW : %.2f%s" %(unw_opt_reg, os.linesep))
			f.write("REG BetaW Recalls: %s%s" %(w_recalls, os.linesep))
			f.write("REG BetaUNW Recalls: %s%s" %(unw_recalls, os.linesep))
			f.flush()

			w_recalls = {}
			unw_recalls = {}
			w_opt = None
			unw_opt = None
			for beta in [0.3, 0.6, 0.8, 1.0]:
			#for beta in [0.3]:
				Logger.logr.info("Starting Running Dict Regularized Baseline for Beta = %s" %beta)
				self.postgres_recorder.truncateSummaryTable()
				dictregs2v = DictRegularizedSen2VecRunner(self.dbstring)
				dictregs2v.dictregBetaW = beta
				dictregs2v.dictregBetaUNW = beta
				if beta==0.3:
					dictregs2v.prepareData(pd)
				dictregs2v.runTheBaseline(rbase, latent_space_size)
				dictregs2v.generateSummary(gs,11,"_neighbor_w",\
					 lambda_val=self.lambda_val, diversity=diversity)
				dictregs2v.generateSummary(gs,12,"_neighbor_unw",\
					 lambda_val=self.lambda_val, diversity=diversity)
				dictregs2v.doHouseKeeping()
				self.__runSpecificEvaluation(models = [20], systems = [11, 12])
				w_recalls[beta] = self.__getRecall(method_id=11, models = [20], systems = [11, 12])
				unw_recalls[beta] = self.__getRecall(method_id=12, models = [20], systems = [11, 12])
				Logger.logr.info("W_Recall for %s = %s" %(beta, w_recalls[beta]))
				Logger.logr.info("UNW_Recall for %s = %s" %(beta, unw_recalls[beta]))
			w_opt_dict_reg = max(w_recalls, key=w_recalls.get)
			unw_opt_dict_reg = max(unw_recalls, key=unw_recalls.get)
			Logger.logr.info("Optimal dictregBetaW=%s and dictregBetaUNW=%s" %(w_opt_dict_reg, unw_opt_dict_reg))

			f.write("DCT BetaW: %.2f%s" %(w_opt_dict_reg, os.linesep))
			f.write("DCT BetaUNW: %.2f%s" %(unw_opt_dict_reg, os.linesep))
			f.write("DCT BetaW Recalls: %s%s" %(w_recalls, os.linesep))
			f.write("DCT BetaUNW Recalls: %s%s" %(unw_recalls, os.linesep))
			f.flush()


######## Test ########################################
			os.environ["DUC_EVAL"]='TEST'

			niter = 5
			for i in range(0,niter):
				f.write("###### Iteration: %s ######%s" %(i, os.linesep))
				f.write("Optimal Window: %s%s" %(window_opt, os.linesep))				
				self.postgres_recorder.truncateSummaryTable()
				paraBaseline = P2VSENTCExecutableRunner(self.dbstring)
				paraBaseline.runTheBaseline(rbase,latent_space_size, window_opt) #we need the p2v vectors created with optimal window
				paraBaseline.generateSummary(gs,\
						lambda_val=self.lambda_val, diversity=diversity)
				paraBaseline.doHouseKeeping()
				f.flush()


				f.write("Optimal Beta: %s%s" %(beta_opt, os.linesep))	
				n2vBaseline = Node2VecRunner(self.dbstring)
				n2vBaseline.mybeta = beta_opt
				generate_walk = False
				n2vBaseline.runTheBaseline(rbase, latent_space_size, generate_walk)
				n2vBaseline.generateSummary(gs, 3, "",\
					 lambda_val=self.lambda_val, diversity=diversity)
				n2vBaseline.generateSummary(gs, 4, "_init",\
					 lambda_val=self.lambda_val, diversity=diversity)
				n2vBaseline.generateSummary(gs, 5, "_retrofit",\
					 lambda_val=self.lambda_val, diversity=diversity)
				n2vBaseline.doHouseKeeping()
				f.flush()


				f.write("Optimal alpha: %.2f%s" %(alpha_opt, os.linesep))	
				iterrunner = IterativeUpdateRetrofitRunner(self.dbstring)
				iterrunner.myalpha = alpha_opt #reinitializing myalpha
				iterrunner.runTheBaseline(rbase)
				iterrunner.generateSummary(gs, 6, "_unweighted",\
						lambda_val=self.lambda_val, diversity=diversity)
				iterrunner.generateSummary(gs, 7, "_weighted",\
						lambda_val=self.lambda_val, diversity=diversity)
				iterrunner.doHouseKeeping()


				f.write("Optimal regBetaW: %.2f%s" %(w_opt_reg, os.linesep))	
				f.write("Optimal regBetaUNW: %.2f%s" %(unw_opt_reg, os.linesep))	
				regs2v = RegularizedSen2VecRunner(self.dbstring)
				regs2v.regBetaW = w_opt_reg 
				regs2v.regBetaUNW = unw_opt_reg
				regs2v.runTheBaseline(rbase, latent_space_size)
				regs2v.generateSummary(gs,9,"_neighbor_w",\
					 lambda_val=self.lambda_val, diversity=diversity)
				regs2v.generateSummary(gs,10,"_neighbor_unw",\
					 lambda_val=self.lambda_val, diversity=diversity)
				regs2v.doHouseKeeping()
				f.flush()


				f.write("Optimal regBetaW: %.2f%s" %(w_opt_dict_reg, os.linesep))	
				f.write("Optimal regBetaUNW: %.2f%s" %(unw_opt_dict_reg, os.linesep))	
				dictregs2v = DictRegularizedSen2VecRunner(self.dbstring)
				dictregs2v.dictregBetaW = w_opt_dict_reg
				dictregs2v.dictregBetaUNW = unw_opt_dict_reg
				dictregs2v.runTheBaseline(rbase, latent_space_size)
				dictregs2v.generateSummary(gs,11,"_neighbor_w",\
					 lambda_val=self.lambda_val, diversity=diversity)
				dictregs2v.generateSummary(gs,12,"_neighbor_unw",\
					 lambda_val=self.lambda_val, diversity=diversity)
				dictregs2v.doHouseKeeping()
				f.flush()
				
				self.__runCombinedEvaluation()

				#20__1_2_3_4_5_6_7_9_10_11_12_21_output_100.txt
				#20__1_2_3_4_5_6_7_9_10_11_12_21_output_10.txt
				f.write ("%s%s"%("#########################Running for Test (100) ###########################################", os.linesep))
				file_ = os.path.join(os.environ["SUMMARYFOLDER"],"20__1_2_3_4_5_6_7_9_10_11_12_21_output_100.txt")
				for line in open(file_):
					f.write(line)
				f.flush()

				f.write ("%s%s"%("#########################Running for Test (10) ###########################################", os.linesep))
				file_ = os.path.join(os.environ["SUMMARYFOLDER"], "20__1_2_3_4_5_6_7_9_10_11_12_21_output_10.txt")
				for line in open(file_):
					f.write(line)

				f.write("%s%s"%(os.linesep, os.linesep))
				f.flush()
Example #10
class ReutersReader(DocumentReader):
    """ 
	Reuters Document Reader

	"""
    def __init__(self, *args, **kwargs):
        """
		It reads the environment variables and initializes the
		base class.
		"""
        DocumentReader.__init__(self, *args, **kwargs)
        self.dbstring = os.environ["REUTERS_DBSTRING"]
        self.postgres_recorder = PostgresDataRecorder(self.dbstring)
        self.folderPath = os.environ['REUTERS_PATH']
        self.validationDict = {}

    def __recordDocumentTopic(self, document_id, doc):
        """

		"""
        topic_names = []
        categories = []

        possible_categories = [
            "topics", "places", "people", "orgs", "exchanges", "companies"
        ]

        for category in possible_categories:
            try:
                topics = doc.find(category).findAll('d')
                for topic in topics:
                    topic = topic.text.strip()
                    topic_names += [topic]
                    categories += [category]
            except:
                pass

        self.postgres_recorder.insertIntoDocTopTable(document_id,\
           topic_names, categories)

    def readTopic(self):
        """
		"""
        topic_names = []
        categories = []
        for file_ in os.listdir(self.folderPath):
            if file_.endswith(".lc.txt"):
                category = file_.split('-')[1]
                content = open("%s%s%s" % (self.folderPath, "/", file_),
                               'r',
                               encoding='utf-8',
                               errors='ignore').read()
                for topic in content.split(os.linesep):
                    topic = topic.strip()
                    if len(topic) != 0:
                        topic_names += [topic]
                        categories += [category]

        self.postgres_recorder.insertIntoTopTable(topic_names, categories)
        Logger.logr.info("Topic reading complete.")

    def _getTopic(self, document_id, doc):
        """
		Interested topic: acq, money-fx, crude, trade, interest. 
		A topic can be one of the interested topic. A topic 
		is assigned based on the order if multiple interested topics 
		are assigned for a particular document. We take top-10 
		frequent topics mentioned in "Text Categorization with support 
		vector machines: Learning with many relevant features."
		"""
        interested_topic_list = ['earn', 'acq', 'money-fx', 'grain', 'crude', 'trade'\
         ,'interest', 'ship', 'wheat', 'corn']

        topics = doc.find("topics").findAll('d')
        for topic in topics:
            topic = topic.text.strip()
            if topic in interested_topic_list:
                return topic

        return "other"

    def __createValidationSet(self, document_ids):

        total_doc = len(document_ids)
        nvalid_doc = int(total_doc * 0.20)

        np.random.seed(2000)
        valid_list = np.random.choice(document_ids, nvalid_doc,
                                      replace=False).tolist()

        for id_ in valid_list:
            self.validationDict[id_] = 1

    def __readAPass(self, load):
        self.postgres_recorder.trucateTables()
        self.postgres_recorder.alterSequences()
        self.readTopic()

        train_doc_ids = []
        for file_ in os.listdir(self.folderPath):
            if file_.endswith(".sgm"):
                file_content = self._getTextFromFile(
                    "%s%s%s" % (self.folderPath, "/", file_))
                soup = BeautifulSoup(file_content, "html.parser")
                for doc in soup.findAll('reuters'):
                    document_id = doc['newid']
                    title = doc.find('title').text if doc.find('title') \
                       is not None else None
                    doc_content = doc.find('text').text if doc.find('text')\
                       is not None else None
                    try:
                        metadata = "OLDID:"+doc['oldid']+"^"+"TOPICS:"+doc['topics']+\
                        "^"+"CGISPLIT:"+doc['cgisplit']+"^"+"LEWISSPLIT:"+doc['lewissplit']

                        if doc['lewissplit'] == "NOT-USED" or doc['topics'] == "NO"\
                        or doc['topics'] == "BYPASS" :
                            Logger.logr.info("Skipping because of ModApte split")
                            continue
                    except:
                        metadata = None
                        continue
                    topic = self._getTopic(document_id, doc)

                    if topic in ['wheat', 'corn', 'other']:
                        continue
                    #if topic not in ['ship','interest']:
                    #	continue

                    istrain = 'YES' if doc['lewissplit'].lower() == 'train' else 'NO'
                    if document_id in self.validationDict:
                        istrain = 'VALID'

                    if istrain == 'YES':
                        train_doc_ids.append(document_id)

                    if load == 0:
                        continue
                    self.postgres_recorder.insertIntoDocTable(document_id, title, \
                       doc_content, file_, metadata)
                    self.__recordDocumentTopic(document_id, doc)
                    self._recordParagraphAndSentence(document_id, doc_content,
                                                     self.postgres_recorder,
                                                     topic, istrain)

        Logger.logr.info("A pass of the document reading complete.")
        return train_doc_ids

    def readDocument(self, ld):
        """
		First, reading and recording the Topics. Second, recording each document at a time	
		Third, for each document, record the lower level information 
		like: paragraph, sentences in table 
		"""
        if ld <= 0: return 0
        train_doc_ids = self.__readAPass(0)
        self.__createValidationSet(train_doc_ids)
        self.__readAPass(1)
        return 1

    def runBaselines(self, pd, rbase, gs):
        """
		"""
        optDict = self._runClassificationOnValidation(pd, rbase, gs, "reuter")
        self.doTesting(optDict, "reuter", rbase, pd, gs, True)
        optDict = self._runClusteringOnValidation(pd, rbase, gs, "reuter")
        self.doTesting(optDict, "reuter", rbase, pd, gs, False)