def lProbLine(self, l, threshold=None):
    """Return the summed log-probability of the tokens in line *l*.

    :param l: text line to score
    :param threshold: if given, tokens whose log-probability is below
        ``-threshold`` are skipped (treated as noise)
    :returns: sum of per-token log-probabilities (0.0 for an empty line)
    """
    total = 0.0
    for w in tokenize(l):
        lp = self.lp(w)
        # 'is None' instead of '== None': identity is the correct
        # (PEP 8 mandated) test for the missing-threshold case.
        if threshold is None or lp >= -threshold:
            total += lp
    return total
def lProbLine(self, l, threshold=None):
    """Sum of token log-probabilities for line *l*.

    :param l: text line to score
    :param threshold: optional cutoff; tokens scoring below
        ``-threshold`` are ignored
    :returns: the accumulated log-probability (0.0 when *l* has no tokens)
    """
    total = 0.0
    for w in tokenize(l):
        lp = self.lp(w)
        # Fixed '== None' to the idiomatic identity check 'is None'.
        if threshold is None or lp >= -threshold:
            total += lp
    return total
def fetchData(self, author):
    """Collect up to 5 publication titles plus up to 5 OAI keyword
    strings for *author*'s paper and tokenize each of them.

    :returns: list of ``set`` objects, one per title/keyword string,
        each holding the punctuation-filtered tokens of that string.
    """
    pubs = author.paper.publication_set.all()[:5]
    titles = [a.full_title() for a in pubs]
    for r in author.paper.oairecord_set.all()[:5]:
        if r.keywords:
            titles.append(r.keywords)
    # Comprehension instead of map(lambda ...): clearer, and always a
    # concrete re-iterable list rather than a one-shot map object on
    # Python 3.
    return [set(filter_punctuation(tokenize(t))) for t in titles]
def nlProbLine(self, l):
    """Mean per-token log-probability of line *l* (0.0 if it has no tokens)."""
    log_probs = [self.lp(token) for token in tokenize(l)]
    if not log_probs:
        return 0.0
    return sum(log_probs) / len(log_probs)
def nlProbLine(self, l):
    """Average log-probability per token of *l*; 0. when *l* is empty."""
    total, count = 0., 0
    for token in tokenize(l):
        total += self.lp(token)
        count += 1
    return total / count if count > 0 else 0.
def _normalizedWScore(self, line, researcher, explain=False):
    """Department topic-model score of *line* minus the background
    language-model score, both normalised per token.

    With ``explain=True``, also print each word's contribution under
    both models.
    """
    dept_model = self.models[researcher.department_id]
    topicScore = dept_model.nlProbLine(line)
    langScore = self.lang.nlProbLine(line)
    if explain:
        for word in dept_model and tokenize(line):
            a = dept_model.lp(word)
            b = self.lang.lp(word)
            print(' '+word+'\t'+str(a)+'-'+str(b)+' = '+str(a-b))
    return topicScore - langScore
def _normalizedWScore(self, line, researcher, explain=False):
    """Difference between the per-token topic-model and language-model
    scores of *line*; optionally prints a word-by-word breakdown when
    *explain* is set."""
    dept = self.models[researcher.department_id]
    topic = dept.nlProbLine(line)
    lang = self.lang.nlProbLine(line)
    if explain:
        for word in tokenize(line):
            ta = dept.lp(word)
            la = self.lang.lp(word)
            print(' ' + word + '\t' + str(ta) + '-' + str(la) + ' = ' + str(ta - la))
    return topic - lang
def get_distr(self, string, debug=False):
    """Return the LDA topic distribution of *string*.

    :param string: raw text to analyse
    :param debug: when True, print the ten strongest topics and their words
    :returns: the LDA model's ``(topic_id, weight)`` distribution for the text
    """
    # Tokenize
    words = tokenize(string)
    # To bag-of-words
    bow = self.dct.doc2bow(words)
    # To topics
    distr = self.lda[bow]
    if debug:
        # print() calls replace Python 2 print statements: output is
        # identical, and this matches the print() style used elsewhere
        # in the file while staying Python 3 compatible.
        for (topic_id, value) in distr[:10]:
            print("Topic id %d, value %.3f" % (topic_id, value))
            print(self._print_topic(self.lda.show_topic(topic_id)))
    return distr
def probLine(self, l):
    """Probability of line *l* as the product of its word probabilities
    (1.0 for an empty line)."""
    result = 1.0
    for token in tokenize(l):
        result = result * self.p(token)
    return result
def feedLine(self, l):
    """Feed every token of line *l* into the word counter."""
    for token in tokenize(l):
        self._countWord(token)
def test_tokenize(self):
    """tokenize splits on any whitespace run and keeps punctuation attached."""
    simple = tokenize('Hello world!')
    self.assertEqual(simple, ['Hello', 'world!'])
    messy = tokenize('99\tbottles\nof beeron \tThe Wall')
    self.assertEqual(messy, ['99', 'bottles', 'of', 'beeron', 'The', 'Wall'])
def fetchData(self, author):
    """Return the punctuation-filtered token set of all contributor
    strings attached to *author*'s paper OAI records.

    :returns: ``set`` of tokens drawn from the concatenated contributor
        strings (records with no contributors are skipped).
    """
    contributors = [r.contributors for r in author.paper.oairecord_set.all()]
    # 'is not None' (identity test) replaces '!= None'; the comprehension
    # also keeps the result a plain list on both Python 2 and Python 3,
    # unlike filter() which is lazy on Python 3.
    contributors = [c for c in contributors if c is not None]
    ta = ' '.join(contributors)
    return set(filter_punctuation(tokenize(ta)))
def fetchData(self, author):
    """Punctuation-filtered token set of the paper's title."""
    title_tokens = tokenize(author.paper.title)
    return set(filter_punctuation(title_tokens))
def probLine(self, l):
    """Line probability: product of per-word probabilities (1. if empty)."""
    word_probs = [self.p(w) for w in tokenize(l)]
    total = 1.
    for p in word_probs:
        total *= p
    return total