def parseAllSentencePfm(self):
    """Tag every stored sentence with its pfm, positive and negative words.

    For each sentence from ``self.db.getAllSentence()`` the content is run
    through ``getProcessedWordList`` twice (once with ``NOUN``, once with
    ``VERB``). The NOUN word list is reduced to the words found in
    ``self.pfmWord``; the VERB word list is reduced to the words found in
    ``self.posWord`` and ``self.negWord`` respectively. The positive and
    negative lists are then passed through
    ``self.filterPosNegWordListByDistance`` and the three lists are written
    back with ``self.db.updateSentencePfm``.

    Returns:
        None. Results are persisted through ``self.db``.
    """
    # Materialise all sentences up front: updating a sentence while a live
    # cursor is still iterating over them would corrupt the cursor.
    sentences = list(self.db.getAllSentence())
    for i, sentence in enumerate(sentences):
        print(i)  # progress indicator
        pfmSentenceWordList = getProcessedWordList(sentence['content'], NOUN)
        # List comprehensions (not filter()) so the results are real lists
        # on both Python 2 and Python 3; they are reused below and stored.
        pfmWordList = [word for word in pfmSentenceWordList if word in self.pfmWord]
        posNegSentenceWordList = getProcessedWordList(sentence['content'], VERB)
        posWordList = [word for word in posNegSentenceWordList if word in self.posWord]
        negWordList = [word for word in posNegSentenceWordList if word in self.negWord]
        posWordList, negWordList = self.filterPosNegWordListByDistance(
            pfmSentenceWordList,
            posNegSentenceWordList,
            pfmWordList,
            posWordList,
            negWordList,
        )
        self.db.updateSentencePfm(sentence['_id'], pfmWordList, posWordList, negWordList)
def isCiteInDistance(self, sentence):
    """Report whether a citation word occurs close to an engager or company name.

    The sentence content is processed with ``getProcessedWordList(..., VERB)``.
    For every word in ``sentence['cite']`` the position of its first
    occurrence in that word list is compared with the first occurrence of
    each engager's lower-cased ``lastName`` and each company's lower-cased
    ``shortName``. A pair counts as a hit when the positions differ by at
    most ``CITE_DISTANCE``.

    Args:
        sentence: mapping with the keys ``'cite'``, ``'content'``,
            ``'engager'`` (ids for ``self.db.getEngagerById``) and
            ``'company'`` (ids for ``self.db.getCompanyById``).

    Returns:
        Tuple ``(isCiteCEO, isCiteAnalyst, isCiteCompany)`` of booleans.
        An engager whose ``type`` is not ``ENGAGER_CEO`` is counted as an
        analyst. All three are ``False`` when ``sentence['cite']`` is empty.

    Raises:
        ValueError: if a citation word is not present in the processed word
            list (unchanged from the previous behaviour).
    """
    isCiteCEO, isCiteAnalyst, isCiteCompany = False, False, False
    citeWords = sentence['cite']
    if not citeWords:
        return isCiteCEO, isCiteAnalyst, isCiteCompany

    wordList = getProcessedWordList(sentence['content'], VERB)

    # The engager/company lookups do not depend on the citation word, so
    # resolve them once instead of once per citation word.
    engagerHits = []  # (index of last name in wordList, is CEO?)
    for engagerId in sentence['engager']:
        try:
            engager = self.db.getEngagerById(engagerId)
            engagerIndex = wordList.index(engager['lastName'].lower())
            engagerHits.append((engagerIndex, engager['type'] == ENGAGER_CEO))
        except Exception:
            # Best effort: skip engagers that cannot be resolved or whose
            # name is not in the sentence. Unlike a bare ``except`` this
            # lets KeyboardInterrupt / SystemExit propagate.
            continue

    companyIndexes = []
    for companyId in sentence['company']:
        try:
            company = self.db.getCompanyById(companyId)
            companyIndexes.append(wordList.index(company['shortName'].lower()))
        except Exception:
            # Same best-effort policy as for engagers.
            continue

    for citeWord in citeWords:
        citeIndex = wordList.index(citeWord)
        for engagerIndex, isCeo in engagerHits:
            if abs(citeIndex - engagerIndex) <= CITE_DISTANCE:
                if isCeo:
                    isCiteCEO = True
                else:
                    isCiteAnalyst = True
        for companyIndex in companyIndexes:
            if abs(citeIndex - companyIndex) <= CITE_DISTANCE:
                isCiteCompany = True

    return isCiteCEO, isCiteAnalyst, isCiteCompany
def parseAllSentenceCitation(self):
    """Annotate every stored sentence with its citation words and proximity flags.

    For each sentence from ``self.db.getAllSentence()``:

    * ``sentence['cite']`` is set to the words of
      ``getProcessedWordList(content, VERB)`` that are in ``self.citeWord``;
    * ``sentence['citeCEO']``, ``sentence['citeAnalyst']`` and
      ``sentence['citeCompany']`` are set from ``self.isCiteInDistance``;
    * the sentence is written back with ``self.db.saveSentence``.

    Returns:
        None. Results are persisted through ``self.db``.
    """
    # Read everything before writing so saves do not disturb the iteration.
    sentences = list(self.db.getAllSentence())
    for i, sentence in enumerate(sentences):
        print(i)  # progress indicator
        words = getProcessedWordList(sentence['content'], VERB)
        # Must be a real list: isCiteInDistance tests it for emptiness and
        # iterates it, and it is persisted afterwards. A lazy filter()
        # object (Python 3) would always be truthy and be exhausted by then.
        sentence['cite'] = [word for word in words if word in self.citeWord]
        (
            sentence['citeCEO'],
            sentence['citeAnalyst'],
            sentence['citeCompany'],
        ) = self.isCiteInDistance(sentence)
        self.db.saveSentence(sentence)
def parseAllSentenceAtrb(self):
    """Tag every stored sentence with its ``exWord`` / ``inWord`` matches.

    For each sentence from ``self.db.getAllSentence()`` the content is
    processed with ``getProcessedWordList(content, NOUN)`` and split into
    the words found in ``self.exWord`` and those found in ``self.inWord``
    (presumably external / internal attribution vocabularies -- confirm
    against where they are loaded). When the ``inWord`` matches contain
    ``'ceo'`` or ``'executive'`` and the sentence has citation words
    (``sentence['cite']`` is non-empty), the ``inWord`` matches are
    discarded. Both lists are stored with ``self.db.updateSentenceAtrb``.

    Returns:
        None. Results are persisted through ``self.db``.
    """
    # Read everything before writing so updates do not disturb the iteration.
    sentences = list(self.db.getAllSentence())
    for i, sentence in enumerate(sentences):
        print(i)  # progress indicator
        words = getProcessedWordList(sentence['content'], NOUN)
        # Real lists are required: inWordList is membership-tested twice
        # and then stored, which a one-shot filter() iterator (Python 3)
        # cannot support.
        exWordList = [word for word in words if word in self.exWord]
        inWordList = [word for word in words if word in self.inWord]
        mentionsExecutive = 'ceo' in inWordList or 'executive' in inWordList
        if mentionsExecutive and sentence['cite']:
            inWordList = []
        self.db.updateSentenceAtrb(sentence['_id'], exWordList, inWordList)
def getSentenceMatrixAndIdList(self, sentences):
    """Build a vector matrix and the matching id list for ``sentences``.

    Each sentence's content is processed with ``getProcessedWordList`` and
    converted to a vector by ``self.getSentenceVector`` using the table
    from ``self.db.getUnigramTable()``.

    Args:
        sentences: iterable of mappings with ``'content'`` and ``'_id'`` keys.

    Returns:
        Tuple ``(matrix, idList)`` where ``matrix`` is ``numpy.array`` of the
        per-sentence vectors and ``idList`` holds the sentence ids in the
        same order.
    """
    table = self.db.getUnigramTable()
    matrix, idList = [], []
    for i, sentence in enumerate(sentences):
        # print() call form works on both Python 2 and 3 and matches the
        # other methods; the old ``print i`` statement is Python 2 only.
        print(i)  # progress indicator
        wordList = getProcessedWordList(sentence['content'])
        matrix.append(self.getSentenceVector(table, wordList))
        idList.append(sentence['_id'])
    return numpy.array(matrix), idList