Python DBController.updateSentencePfm 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: DBController

클래스/타입: DBController

메소드/함수: updateSentencePfm

hotexamples.com에서의 예제들: 1

Python DBController.updateSentencePfm - 1개의 예제가 발견되었습니다. 이것은 오픈소스 프로젝트에서 추출한, Python의 DBController.DBController.updateSentencePfm에 대한 실제 사용 예제 중 가장 높은 평가를 받은 예제입니다. 예제를 평가해 주시면 예제의 품질 향상에 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

DBController(17)

add_document(3)

get_all_articles(3)

saveArticle(3)

get_article(2)

getFeatureListByWeek(2)

getEngagerById(2)

getCompanyById(2)

getAllUserList(2)

getAllSentence(2)

not_updated_list(2)

create_table(2)

commit_n_close(2)

select_data_single(2)

getTop50Rank(2)

update_isdownload(2)

update_app(2)

updateCompanyCEO(1)

insertMTVReviewToDB(1)

member_activities(1)

isFeatureInDB(1)

update_activity(1)

insert_data(1)

insertTop50ChartToDB(1)

update_admin(1)

insertSpeech(1)

insertSession(1)

insertSentence(1)

insertSalesChartToDB(1)

update_data(1)

update_is_downloaded(1)

insertIMVDBDataToDB(1)

member_login(1)

insertFeatureToDB(1)

insertEngager(1)

insertConference(1)

insertCompany(1)

insertCommentToDB(1)

get_old_category_app_list(1)

get_all_documents(1)

update_member(1)

getYoutubeData(1)

getUserCount(1)

getUserById(1)

getUnprocessedArticleInBatch(1)

member_is_present(1)

member_status_in_activity(1)

storeEntry(1)

updateSentencePfm(1)

updateDietHistory(1)

예제 #1

파일 보기

파일: SignifierParser.py 프로젝트: exsonic/CorpusAnalysis

class SignifierParser(object):
	"""Extract sentences that mention known engagers/companies and tag them.

	The parser reads articles, engagers and companies through ``self.db``,
	stores sentences that mention at least one of them, and later annotates
	the stored sentences with citation, 'pfm' and 'atrb' word matches
	(presumably "performance" and "attribution" -- confirm against the
	word-list definitions).
	"""

	# Engagers with one of these last names are searched by their full
	# ``name`` instead of ``lastName`` (presumably because the bare last name
	# is too ambiguous in running text -- confirm).
	_FULL_NAME_LAST_NAMES = frozenset(('Jones', 'Johnson', 'West', 'Post', 'Ford'))

	# Failures that simply mean "this name could not be located in the
	# sentence" (missing record, missing field, word not in the word list).
	_NAME_LOOKUP_ERRORS = (ValueError, KeyError, TypeError, AttributeError)

	def __init__(self):
		"""Load word lists, engagers and companies, and pre-compile name patterns."""
		self.db = DBController()
		self.pfmWord = getWordList(WORD_PFM)
		self.posWord = getWordList(WORD_POS)
		self.negWord = getWordList(WORD_NEG)
		self.exWord = getWordList(ATRB_EX)
		self.inWord = getWordList(ATRB_IN)
		self.citeWord = getWordList(WORD_CITE)
		self.engagerList = list(self.db.getAllEngager())
		self.companyList = list(self.db.getAllCompany())

		self.engagerRegexPatternDict, self.companyRegexPatternDict = self.getRegexPatternDictForEngagerAndCompany()

	def getRegexPatternDictForEngagerAndCompany(self):
		"""Build whole-word regex patterns for every engager and company.

		Returns:
			A pair ``(engagerPatterns, companyPatterns)`` of dicts mapping
			each record's ``_id`` to a compiled pattern. Engager patterns
			are case-sensitive; company patterns ignore case.
		"""
		engagerRegexPatternDict, companyRegexPatternDict = {}, {}
		for engager in self.engagerList:
			if engager['lastName'] in self._FULL_NAME_LAST_NAMES:
				searchName = engager['name']
			else:
				searchName = engager['lastName']
			# re.escape: names are literal text, not regex fragments.
			engagerRegexPatternDict[engager['_id']] = re.compile(r'\b' + re.escape(searchName) + r'\b')

		for company in self.companyList:
			companyRegexPatternDict[company['_id']] = re.compile(r'\b' + re.escape(company['shortName']) + r'\b', re.IGNORECASE)

		return engagerRegexPatternDict, companyRegexPatternDict

	def extractAllSentenceToDB(self, isReload=False):
		"""Split every company's articles into sentences and store the relevant ones.

		Args:
			isReload: when true, drop the stored sentences before extracting.
		"""
		if isReload:
			self.db.dropSentence()
		paragraphSet = ('leadParagraph', 'tailParagraph')
		for i, company in enumerate(self.companyList):
			articles = list(self.db.getAllArticleByCompanyCode(company['code']))
			engagers = list(self.db.getAllEngagerByCompanyId(company['_id']))
			for j, article in enumerate(articles):
				# Progress output: company index, article index.
				print(i, j)
				for key in paragraphSet:
					for string in sent_tokenize(article[key]):
						if not isValidSentence(string):
							continue
						# NOTE(review): on Python 3 encode() yields bytes, which the
						# str patterns used by parseRawSentence cannot search --
						# confirm the target interpreter before porting.
						sentenceDict = {'content' : string.encode('utf-8'), 'articleId' : article['_id'], 'paragraph' : key}
						sentenceDict = self.parseRawSentence(sentenceDict, engagers)
						if sentenceDict is not None:
							self.db.insertSentence(sentenceDict)

	def parseRawSentence(self, sentenceDict, engagerList):
		"""Attach the ids of engagers and companies mentioned in a sentence.

		Args:
			sentenceDict: dict with at least a ``'content'`` entry; it is
				updated in place.
			engagerList: engager records to look for in the sentence.

		Returns:
			``sentenceDict`` with ``'engager'`` and ``'company'`` id lists
			added, or ``None`` when the sentence mentions neither.
		"""
		content = sentenceDict['content']
		engagerIdList = [
			engager['_id'] for engager in engagerList
			if self.engagerRegexPatternDict[engager['_id']].search(content) is not None
		]
		companyIdList = [
			company['_id'] for company in self.companyList
			if self.companyRegexPatternDict[company['_id']].search(content) is not None
		]

		if not engagerIdList and not companyIdList:
			return None

		sentenceDict['engager'] = list(set(engagerIdList))
		sentenceDict['company'] = list(set(companyIdList))
		return sentenceDict

	def parseAllSentenceCitation(self):
		"""Tag every stored sentence with its citation words and citation flags."""
		sentences = list(self.db.getAllSentence())
		for i, sentence in enumerate(sentences):
			print(i)
			words = getProcessedWordList(sentence['content'], VERB)
			# A real list (not a lazy filter object): it is truth-tested,
			# iterated and saved below.
			sentence['cite'] = [word for word in words if word in self.citeWord]
			sentence['citeCEO'], sentence['citeAnalyst'], sentence['citeCompany'] = self.isCiteInDistance(sentence)
			self.db.saveSentence(sentence)

	def parseAllSentencePfm(self):
		"""Tag every stored sentence with pfm words and nearby positive/negative words."""
		# Materialize all sentences first: updating documents while iterating
		# the live cursor would corrupt the cursor.
		sentences = list(self.db.getAllSentence())
		for i, sentence in enumerate(sentences):
			print(i)
			pfmSentenceWordList = getProcessedWordList(sentence['content'], NOUN)
			pfmWordList = [word for word in pfmSentenceWordList if word in self.pfmWord]
			posNegSentenceWordList = getProcessedWordList(sentence['content'], VERB)
			posWordList = [word for word in posNegSentenceWordList if word in self.posWord]
			negWordList = [word for word in posNegSentenceWordList if word in self.negWord]

			posWordList, negWordList = self.filterPosNegWordListByDistance(pfmSentenceWordList, posNegSentenceWordList, pfmWordList, posWordList, negWordList)

			self.db.updateSentencePfm(sentence['_id'], pfmWordList, posWordList, negWordList)

	def parseAllSentenceAtrb(self):
		"""Tag every stored sentence with its 'ex' and 'in' atrb words.

		Reads ``sentence['cite']``, so parseAllSentenceCitation must have
		been run on the stored sentences beforehand.
		"""
		sentences = list(self.db.getAllSentence())
		for i, sentence in enumerate(sentences):
			print(i)
			words = getProcessedWordList(sentence['content'], NOUN)
			exWordList = [word for word in words if word in self.exWord]
			inWordList = [word for word in words if word in self.inWord]
			# A sentence that cites someone and mentions 'ceo'/'executive'
			# gets no 'in' words at all.
			if ('ceo' in inWordList or 'executive' in inWordList) and sentence['cite']:
				inWordList = []
			self.db.updateSentenceAtrb(sentence['_id'], exWordList, inWordList)

	def isCiteInDistance(self, sentence):
		"""Report whether a citation word occurs close to an engager or company name.

		A name counts when its first occurrence in the processed word list
		lies within CITE_DISTANCE positions of a citation word's first
		occurrence.

		Args:
			sentence: dict with ``'content'``, ``'cite'``, ``'engager'`` and
				``'company'`` entries.

		Returns:
			A tuple ``(isCiteCEO, isCiteAnalyst, isCiteCompany)`` of bools.
			Any engager whose type is not ENGAGER_CEO is reported through
			``isCiteAnalyst``.
		"""
		isCiteCEO, isCiteAnalyst, isCiteCompany = False, False, False
		if sentence['cite']:
			wordList = getProcessedWordList(sentence['content'], VERB)
			for citeWord in sentence['cite']:
				citeIndex = wordList.index(citeWord)
				for engagerId in sentence['engager']:
					try:
						engager = self.db.getEngagerById(engagerId)
						matchName = engager['lastName'].lower()
						engagerIndex = wordList.index(matchName)
						if abs(citeIndex - engagerIndex) <= CITE_DISTANCE:
							if engager['type'] == ENGAGER_CEO:
								isCiteCEO = True
							else:
								isCiteAnalyst = True
					except self._NAME_LOOKUP_ERRORS:
						# Best effort: a name that cannot be located in the word
						# list is skipped. Interrupts and programming errors
						# are no longer swallowed.
						pass

				for companyId in sentence['company']:
					try:
						company = self.db.getCompanyById(companyId)
						matchName = company['shortName'].lower()
						companyIndex = wordList.index(matchName)
						if abs(citeIndex - companyIndex) <= CITE_DISTANCE:
							isCiteCompany = True
					except self._NAME_LOOKUP_ERRORS:
						pass
		return isCiteCEO, isCiteAnalyst, isCiteCompany

	def filterPosNegWordListByDistance(self, pfmSentenceWordList, posNegSentenceWordList, pfmWordList, posWordList, negWordList):
		"""Keep only positive/negative words that lie near a pfm word.

		Positions are first-occurrence indexes: pfm words are located in
		``pfmSentenceWordList``, positive/negative words in
		``posNegSentenceWordList``. A word is kept when the two indexes
		differ by at most PFM_DISTANCE.
		NOTE(review): this compares indexes taken from two differently
		processed word lists -- confirm they are position-aligned.

		Returns:
			A pair ``(filteredPosWordList, filteredNegWordList)``. A word is
			appended once per pfm word it is close to, so duplicates can occur.
		"""
		# The candidate lists are walked once per pfm word, so make sure they
		# are re-iterable even if a caller hands in a one-shot iterator.
		posWordList = list(posWordList)
		negWordList = list(negWordList)

		filteredPosWordList, filteredNegWordList = [], []
		for pfmWord in pfmWordList:
			pfmIndex = pfmSentenceWordList.index(pfmWord)
			for posWord in posWordList:
				posIndex = posNegSentenceWordList.index(posWord)
				if abs(pfmIndex - posIndex) <= PFM_DISTANCE:
					filteredPosWordList.append(posWord)
			for negWord in negWordList:
				negIndex = posNegSentenceWordList.index(negWord)
				if abs(pfmIndex - negIndex) <= PFM_DISTANCE:
					filteredNegWordList.append(negWord)
		return filteredPosWordList, filteredNegWordList