# Imports assumed by this section; project-local helpers (DBController,
# getWordList, getAbsPath, parseArticleFromXML, the WORD_*/MCD_* constants,
# etc.) are expected to be imported elsewhere in the module.
import os
import re
import csv
from threading import Thread

from dateutil import parser
from nltk.tokenize import sent_tokenize
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.plaintext.writer import PlaintextWriter


def loadAllRTFToDB(folderPath):
    db = DBController()
    for dirPath, dirNames, fileNames in os.walk(folderPath):
        for fileName in fileNames:
            if not fileName.endswith('.rtf'):
                continue
            filePath = os.path.join(dirPath, fileName)
            print(filePath)
            try:
                with open(filePath) as f:
                    doc = Rtf15Reader.read(f)
                text = PlaintextWriter.write(doc).getvalue()
            except Exception:
                # skip files pyth cannot parse
                continue
            lines = [line.strip() for line in text.split('\n') if line]
            # split the dump into articles; each article ends with a
            # 'Document <accession id>' line
            articleLinesDict, articleStartIndex = {}, 0
            for i, line in enumerate(lines):
                if line.startswith('Document ') and len(line.split(' ')) == 2:
                    articleId = line.split(' ')[-1]
                    articleLinesDict[articleId] = lines[articleStartIndex : i]
                    articleStartIndex = i + 1
            for articleId, articleLines in articleLinesDict.iteritems():
                # locate the byline, the '<n> words' line and the start of the
                # body (two lines after the 'English' language marker)
                bylineIndex, wordCountIndex, textStartIndex = -1, -1, -1
                for i, line in enumerate(articleLines):
                    line = line.lower()
                    if line.startswith('by '):
                        bylineIndex = i
                    elif line.endswith(' words'):
                        wordCountIndex = i
                    elif line == 'english':
                        textStartIndex = i + 2
                if wordCountIndex == -1 or textStartIndex == -1 or wordCountIndex > textStartIndex:
                    # malformed article: log its location and skip it
                    print(filePath + ', ' + articleId)
                else:
                    articleDict = {
                        '_id': articleId,
                        'filePath': filePath.split('Marshall_RA/')[-1],
                        'headline': ' '.join(articleLines[: wordCountIndex]) if bylineIndex == -1 else ' '.join(articleLines[: bylineIndex]),
                        'byline': '' if bylineIndex == -1 else articleLines[bylineIndex],
                        'date': parser.parse(articleLines[wordCountIndex + 1]),
                        # the source name follows the date, unless a time line
                        # (containing ' AM'/' PM') sits in between
                        'sourceName': articleLines[wordCountIndex + 2] if articleLines[wordCountIndex + 2].find(' AM') == -1 and articleLines[wordCountIndex + 2].find(' PM') == -1 else articleLines[wordCountIndex + 3],
                        'leadParagraph': '',
                        'tailParagraph': '\n'.join(articleLines[textStartIndex:]),
                        'sourceCode': '',
                        'industry': [],
                        'region': [],
                        'newsSubject': [],
                        'company': []}
                    db.saveArticle(articleDict)
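
# Illustrative sketch (not taken from the source data) of the per-article
# layout loadAllRTFToDB assumes inside a Factiva RTF dump, reconstructed from
# the parsing logic above. The field positions, not the sample values, are
# what the code relies on:
#
#   Acme Corp beats estimates          <- headline (one or more lines)
#   By John Smith                      <- optional byline
#   523 words                          <- word-count line
#   12 March 2004                      <- date (parsed with dateutil)
#   10:31 AM                           <- optional time line (' AM'/' PM')
#   The Wall Street Journal            <- source name
#   English                            <- language marker; body starts 2 lines later
#   Acme Corp said on Friday ...       <- body, stored as 'tailParagraph'
#   Document j000000020040312dxxxxxxx  <- terminator carrying the article id
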
def loadAllXMLtoDB(inputDir):
    # walk the folder tree and insert every parsed XML article into the DB
    db = DBController()
    for dirName, _, fileNames in os.walk(inputDir):
        print(dirName)
        for fileName in fileNames:
            try:
                if not fileName.endswith('xml'):
                    continue
                fileAbsPath = getAbsPath(dirName, fileName)
                for articleDict in parseArticleFromXML(fileAbsPath):
                    # duplication check
                    # if db.isArticleDuplicate(articleDict['tailParagraph']):
                    #     continue
                    articleDict['filePath'] = fileAbsPath.split('Marshall_RA/')[1]
                    db.saveArticle(articleDict)
            except Exception as e:
                print('%s: %s/%s' % (e, dirName, fileName))
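
# parseArticleFromXML is a project helper defined elsewhere. A minimal sketch
# of what it is assumed to do, yielding dicts with the same fields that
# loadAllRTFToDB builds; the tag and attribute names here are hypothetical,
# not taken from the real Factiva XML schema:
#
# import xml.etree.ElementTree as ET
#
# def parseArticleFromXML(fileAbsPath):
#     root = ET.parse(fileAbsPath).getroot()
#     for node in root.iter('article'):               # hypothetical tag
#         yield {'_id': node.get('accessionNo'),      # hypothetical attribute
#                'headline': node.findtext('headline', ''),
#                'byline': node.findtext('byline', ''),
#                'date': parser.parse(node.findtext('date')),
#                'sourceName': node.findtext('sourceName', ''),
#                'leadParagraph': node.findtext('leadParagraph', ''),
#                'tailParagraph': node.findtext('tailParagraph', ''),
#                'sourceCode': '', 'industry': [], 'region': [],
#                'newsSubject': [], 'company': []}
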
class DataProcessorThread(Thread):

    def __init__(self, taskQueue, resultQueue, *args):
        super(DataProcessorThread, self).__init__()
        self._taskQueue = taskQueue
        self._resultQueue = resultQueue
        self._args = args
        self._executeFunction = None
        self._db = DBController()
        self._citeWordList = getWordList(WORD_CITE)
        if not os.path.exists('export/'):
            os.makedirs('export/')

    def exportSentenceAnalysis(self):
        # the sentence collection holds every sentence
        # deprecated, needs refactoring to use the task/result queues
        with open('export/sentence.csv', 'wb') as f:
            writer = csv.writer(f)
            sentences = self._db.getAllSentence()
            articleDict = {}
            # 'sen_coname' was a duplicate 'coname' column in the original
            # header; renamed so the sentence-level company names are distinct
            attributeList = ['id', 'cotic', 'coname', 'filePath', 'accessionNo', 'content',
                             'sen_coname', 'ceoname', 'cite', 'co_c', 'ceo_c', 'analyst_c',
                             'pfm', 'pfm_words', 'pos', 'pos_words', 'neg', 'neg_words',
                             'internal', 'int_words', 'external', 'ext_words',
                             'quote_sen', 'analyst']
            writer.writerow(attributeList)
            for i, sentence in enumerate(sentences):
                try:
                    print(i)
                    if sentence['articleId'] not in articleDict:
                        articleDict[sentence['articleId']] = self._db.getArticleById(sentence['articleId'])
                    article = articleDict[sentence['articleId']]
                    articlePathPartList = article['filePath'].split('/')
                    articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                    articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                    articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                    sentenceCompanyList = [self._db.getCompanyById(companyId) for companyId in sentence['company']]
                    sentenceCompanyNameString = ','.join([company['shortName'] for company in sentenceCompanyList])
                    sentenceEngagerList = [self._db.getEngagerById(engagerId) for engagerId in sentence['engager']]
                    CEOList = filter(lambda engager: engager['type'] == ENGAGER_CEO, sentenceEngagerList)
                    # analystList is currently unused in the export
                    analystList = filter(lambda engager: engager['type'] == ENGAGER_ANALYST, sentenceEngagerList)
                    CEONameString = ','.join([CEO['lastName'] for CEO in CEOList])
                    citeWordString = ','.join(sentence['cite'])
                    citeCompany, citeCEO, citeAnalyst = int(sentence['citeCompany']), int(sentence['citeCEO']), int(sentence['citeAnalyst'])
                    pfmWordString = ','.join(sentence['pfm'])
                    posWordString = ','.join(sentence['pos'])
                    negWordString = ','.join(sentence['neg'])
                    inWordString = ','.join(sentence['in'])
                    exWordString = ','.join(sentence['ex'])
                    quoteString = getQuotedString(sentence['content'])
                    analystSurroundString = getStringSurroundWordInDistance(sentence['content'], 'analyst', ANALYST_SURROUND_DISTANCE)
                    lineList = [sentence['_id'], articleCompanyCode, articleCompanyName,
                                article['filePath'], article['_id'], sentence['content'].encode('utf-8'),
                                sentenceCompanyNameString, CEONameString, citeWordString,
                                citeCompany, citeCEO, citeAnalyst,
                                len(sentence['pfm']), pfmWordString,
                                len(sentence['pos']), posWordString,
                                len(sentence['neg']), negWordString,
                                len(sentence['in']), inWordString,
                                len(sentence['ex']), exWordString,
                                quoteString, analystSurroundString]
                    writer.writerow(lineList)
                except Exception as e:
                    print(e)

    def exportArticleAnalysis(self):
        # deprecated
        with open('export/article.csv', 'wb') as f:
            writer = csv.writer(f)
            articleList = list(self._db.getAllArticle())
            attributeList = ['cotic', 'coname', 'filePath', 'accessNo', 'date', 'source', 'byline',
                             'coname1', 'coname2', 'coname3', 'coname4', 'coname5',
                             'subjectCode1', 'subjectCode2', 'subjectCode3', 'subjectCode4', 'subjectCode5']
            writer.writerow(attributeList)
            for i, article in enumerate(articleList):
                try:
                    print(i)
                    articlePathPartList = article['filePath'].split('/')
                    articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                    articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                    articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                    companyCodeList = [''] * ARTICLE_EXPORT_CODE_SIZE
                    subjectCodeList = [''] * ARTICLE_EXPORT_CODE_SIZE
                    if 'company' in article:
                        # j, not i: the outer progress index must not be clobbered
                        for j, companyCode in enumerate(article['company']):
                            if j >= ARTICLE_EXPORT_CODE_SIZE:
                                break
                            companyCodeList[j] = companyCode
                    else:
                        article['company'] = [articleCompanyCode]
                        # fill the padded list in place so every row keeps a
                        # fixed column count (the original assigned the short
                        # list itself, producing ragged rows)
                        companyCodeList[0] = articleCompanyCode
                    if 'newsSubject' in article:
                        for j, subjectCode in enumerate(article['newsSubject']):
                            if j >= ARTICLE_EXPORT_CODE_SIZE:
                                break
                            subjectCodeList[j] = subjectCode
                    else:
                        article['newsSubject'] = []
                    self._db.saveArticle(article)
                    lineList = [articleCompanyCode, articleCompanyName, article['filePath'],
                                article['_id'], article['date'], article['sourceName'],
                                article['byline']] + companyCodeList + subjectCodeList
                    writer.writerow(lineList)
                except Exception as e:
                    print(e)

    def processKeywordSearch(self):
        searchString = self._args[0]
        while True:
            article = self._taskQueue.get()
            if article == END_OF_QUEUE:
                break
            articlePathPartList = article['filePath'].split('/')
            articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
            articleCompany = self._db.getCompanyByCode(articleCompanyCode)
            articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
            articleSentenceList = []
            # combining the keywords with '|' into one regex is fine here:
            # sentences are short, so it barely hurts performance; the DB-level
            # search iterates over the keywords instead
            pattern = getPatternByKeywordSearchString(searchString)
            # match on the sentence level first; fall back to paragraph level
            for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
                sentenceList = sent_tokenize(paragraph)
                for sentence in sentenceList:
                    if re.search(pattern, sentence) is not None:
                        articleSentenceList.append(sentence.encode('utf-8').strip())
            if not articleSentenceList:
                # nothing matched per sentence: search on the paragraph level
                for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
                    if re.search(pattern, paragraph) is not None:
                        articleSentenceList.append(paragraph.encode('utf-8').strip())
            lineList = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'],
                        article['date'], article['sourceName'].strip(), article['byline'].strip(),
                        article['headline'].strip(), '\t'.join(articleSentenceList)]
            self._resultQueue.put(lineList)
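
    # getPatternByKeywordSearchString is defined elsewhere in the project.
    # Going by the comment above, it presumably '|'-joins the keywords into a
    # single compiled regex; an assumed sketch, not the actual implementation:
    #
    #   def getPatternByKeywordSearchString(searchString):
    #       words = [word.strip() for word in searchString.split(',') if word.strip()]
    #       return re.compile('|'.join([r'\b' + re.escape(word) + r'\b' for word in words]),
    #                         re.IGNORECASE)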

    def processCitationBlock(self):
        # the brokerage name list is too long for one regex, so chunk the
        # names into alternation patterns of 500 names each
        brokerNameList = list(self._db.getAllBrokerageEffectiveNameList())
        brokerageNamePatternList = []
        for i in range(0, len(brokerNameList), 500):
            brokerageNamePatternList.append(
                re.compile(r'|'.join([r'\b' + name + r'\b' for name in brokerNameList[i : i + 500]]), re.IGNORECASE))
        quotePattern = re.compile(r'\"[^\"]+\"')
        citeWordPatternStringList = [(r'\b' + citeWord + r'\b') for citeWord in self._citeWordList]
        companyCEODict = self._db.getAllCompanyCEODict()
        engagerNamePattern = re.compile(r'|'.join(['CEO', 'analyst', 'executive']), re.IGNORECASE)
        citeWordPattern = re.compile(r'|'.join(citeWordPatternStringList), re.IGNORECASE)
        wordMatchPatternList = [getWordRegexPattern(WORD_CAUSE_IN), getWordRegexPattern(WORD_CAUSE_EX),
                                getWordRegexPattern(WORD_CONTROL_LOW), getWordRegexPattern(WORD_CONTROL_HIGH),
                                getWordRegexPattern(MCD_POS), getWordRegexPattern(MCD_NEG),
                                getWordRegexPattern(MCD_UNCERTAIN)]
        filterWordDict = getWordDict(WORD_FILTER)
        while True:
            # process articles in batches
            articleBatch = self._taskQueue.get()
            if articleBatch == END_OF_QUEUE:
                self._taskQueue.task_done()
                break
            lineListBatch = []
            toProcessSentenceBatch = []
            # column layout of one output row, derived from the code below:
            #   0-8   article part (cotic, coname, filePath, accessionNo,
            #         date, source, byline, byline_cleaned, headline)
            #   9     sentence text, 10 quoted strings, 11 cite words
            #   12-16 NER part (persons, organizations, engager role words,
            #         FCEO flag, brokerage flag)
            #   17    sentence word count
            #   18-   per-dictionary match counts, then the matched words
            sentenceTextIndex, NERStartIndex, NERPartCount, wordMatchStartIndex = 9, 12, 5, 18
            # adds 'byline_cleaned' to every articleDict in the batch
            self.processBylineInBatch(articleBatch)
            for article in articleBatch:
                self._db.setArticleProcessed(article['_id'])
                articlePathPartList = article['filePath'].split('/')
                articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                articleLineListPart = [articleCompanyCode, articleCompanyName, article['filePath'],
                                       article['_id'], article['date'], article['sourceName'].strip(),
                                       article['byline'].strip(), article['byline_cleaned'],
                                       article['headline'].strip()]
                for paragraph in [article['leadParagraph'], article['tailParagraph']]:
                    # look for quoted spans in this paragraph; among all the
                    # quoted parts, the longest MUST exceed 5 words
                    quotedStringList = re.findall(quotePattern, paragraph)
                    if quotedStringList and max([len(string.split()) for string in quotedStringList]) > 5:
                        # if so, inspect the paragraph sentence by sentence
                        sentenceList = sent_tokenize(paragraph)
                        for sentence in sentenceList:
                            quotedStringList = re.findall(quotePattern, sentence)
                            citeWordList = re.findall(citeWordPattern, sentence)
                            # keep the sentence if it carries a quotation of
                            # more than 5 words and at least one cite word
                            if quotedStringList and max([len(string.split()) for string in quotedStringList]) > 5 and citeWordList:
                                lineList = (articleLineListPart
                                            + [sentence, '. '.join(quotedStringList), ', '.join(citeWordList)]
                                            + [''] * NERPartCount
                                            + [len(sentence.split())]
                                            + [''] * len(wordMatchPatternList) * 2)
                                # match the keywords of every dictionary
                                for i, pattern in enumerate(wordMatchPatternList):
                                    matchedWordList = getMatchWordListFromPattern(sentence, pattern, filterWordDict)
                                    lineList[i + wordMatchStartIndex] = len(matchedWordList)
                                    lineList[i + len(wordMatchPatternList) + wordMatchStartIndex] = ', '.join(matchedWordList)
                                lineListBatch.append(lineList)
                                toProcessSentenceBatch.append(sentence)
            actorAndOrgListBatch = self.processCiteSentenceInBatch(toProcessSentenceBatch)
            for i, actorAndOrgList in enumerate(actorAndOrgListBatch):
                if actorAndOrgList is not None:
                    engagerNameList = re.findall(engagerNamePattern, lineListBatch[i][sentenceTextIndex])
                    FCEO = 0
                    articleCompanyCode = lineListBatch[i][0]
                    # FCEO = 1 if any recognized person name part matches the
                    # focal company's CEO name
                    for name in actorAndOrgList[0].split(', '):
                        for namePart in name.split():
                            if articleCompanyCode in companyCEODict and companyCEODict[articleCompanyCode].find(namePart) != -1:
                                FCEO = 1
                    lineListBatch[i][NERStartIndex] = actorAndOrgList[0]
                    lineListBatch[i][NERStartIndex + 1] = actorAndOrgList[1]
                    lineListBatch[i][NERStartIndex + 2] = ' '.join(engagerNameList)
                    lineListBatch[i][NERStartIndex + 3] = FCEO
                    # look for a brokerage name outside the quoted spans; the
                    # match must start with an uppercase letter to count
                    unQuotedPart = re.sub(r'"[^"]+"', '', lineListBatch[i][sentenceTextIndex])
                    findBrokerage = False
                    for pattern in brokerageNamePatternList:
                        result = pattern.search(unQuotedPart)
                        if result is not None and result.string[result.regs[0][0]].isupper():
                            findBrokerage = True
                            break
                    lineListBatch[i][NERStartIndex + 4] = 1 if findBrokerage else 0
                self._resultQueue.put(lineListBatch[i])
            self._taskQueue.task_done()

    def getNERTaggedTupleListFromSentence(self, sentence):
        # shell out to the SENNA named-entity tagger: it is fast
        sentence = unicode(sentence).encode('utf-8', 'ignore')
        with open('temp/input.txt', 'w') as f:
            f.write(sentence)
        os.system('./senna/senna -path senna/ -ner <temp/input.txt> temp/output.txt')
        with open('temp/output.txt', 'r') as f:
            # each output line is 'word  TAG'; strip the BIO prefix so
            # 'B-PER' and 'I-PER' both become 'PER'
            tagTupleList = [[word.strip().split('-')[-1] if i == 1 else word.strip()
                             for i, word in enumerate(line.split())]
                            for line in f.readlines() if line.split()]
        return tagTupleList

    def processBylineInBatch(self, articleBatch):
        # substitute 'null.' for an empty byline: an empty byline at the end
        # of the batch would otherwise be dropped from the concatenated string
        tagTupleList = self.getNERTaggedTupleListFromSentence(
            ' ****** '.join([article['byline'] if article['byline'] else 'null.' for article in articleBatch]))
        personList, lastTag, wordList = [], '', []
        articleIndex = 0
        for i in range(len(tagTupleList)):
            if tagTupleList[i][1] != lastTag:
                if lastTag == 'PER':
                    personList.append(' '.join(wordList))
                wordList = [tagTupleList[i][0]]
                lastTag = tagTupleList[i][1]
            else:
                wordList.append(tagTupleList[i][0])
            if tagTupleList[i][0].find('****') != -1 or i == len(tagTupleList) - 1:
                # end of one byline
                articleBatch[articleIndex]['byline_cleaned'] = ', '.join(personList) if personList else ''
                personList, lastTag, wordList = [], '', []
                articleIndex = articleIndex + 1 if i != len(tagTupleList) - 1 else articleIndex
        if articleIndex >= len(articleBatch):
            return
        # pad the remaining articles if SENNA returned fewer sentence breaks
        while articleIndex < len(articleBatch):
            articleBatch[articleIndex]['byline_cleaned'] = ''
            articleIndex += 1
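
    # Illustrative sketch of the SENNA '-ner' output that the two batch
    # methods here assume (column spacing and the exact tag inventory may
    # differ between SENNA builds):
    #
    #   John     B-PER
    #   Smith    I-PER
    #   ******   O
    #   null     O
    #   .        O
    #
    # split('-')[-1] collapses B-PER/I-PER to PER, and the ' ****** '
    # delimiter joined between inputs marks the per-item boundaries.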

    def processCiteSentenceInBatch(self, sentenceBatch):
        tagTupleList = self.getNERTaggedTupleListFromSentence(' ****** '.join(sentenceBatch))
        personAndOrgListBatch = []
        personList, orgnizationList, inQuoteFlag, lastTag, wordList = [], [], False, '', []
        for i in range(len(tagTupleList)):
            if tagTupleList[i][0] == '\"':
                # toggle the quote flag; names inside quotations are ignored
                inQuoteFlag = 1 - inQuoteFlag
                if not inQuoteFlag:
                    del wordList[:]
            else:
                if not inQuoteFlag:
                    if tagTupleList[i][1] != lastTag:
                        if lastTag == 'PER':
                            personList.append(' '.join(wordList))
                        elif lastTag == 'ORG':
                            orgnizationList.append(' '.join(wordList))
                        wordList = [tagTupleList[i][0]]
                        lastTag = tagTupleList[i][1]
                    else:
                        wordList.append(tagTupleList[i][0])
            if tagTupleList[i][0].find('****') != -1 or i == len(tagTupleList) - 1:
                # end of one sentence
                if not personList and not orgnizationList:
                    personAndOrgListBatch.append(None)
                else:
                    personAndOrgListBatch.append([', '.join(personList), ', '.join(orgnizationList)])
                personList, orgnizationList, inQuoteFlag, lastTag, wordList = [], [], False, '', []
        return personAndOrgListBatch

    def run(self):
        # _executeFunction is presumably assigned one of the process* methods
        # by the caller before start() is invoked
        self._executeFunction()
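
# A minimal sketch (not from the original module) of how this class is
# presumably wired together: fill the task queue, start a few threads whose
# _executeFunction points at one of the process* methods, terminate with one
# END_OF_QUEUE sentinel per thread, and drain the result queue into a CSV.
# THREAD_COUNT, the search string, and the output path are illustrative
# choices, not project constants.
#
# from Queue import Queue
#
# if __name__ == '__main__':
#     THREAD_COUNT = 4
#     taskQueue, resultQueue = Queue(), Queue()
#     db = DBController()
#     for article in db.getAllArticle():
#         taskQueue.put(article)
#     threadList = []
#     for _ in range(THREAD_COUNT):
#         taskQueue.put(END_OF_QUEUE)
#         thread = DataProcessorThread(taskQueue, resultQueue, 'revenue, merger')
#         thread._executeFunction = thread.processKeywordSearch
#         thread.start()
#         threadList.append(thread)
#     for thread in threadList:
#         thread.join()
#     with open('export/keywordSearch.csv', 'wb') as f:
#         writer = csv.writer(f)
#         while not resultQueue.empty():
#             writer.writerow(resultQueue.get())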