Example #1
def lProbLine(self, l, threshold=None):
    # Sum of token log-probabilities for the line; when a threshold is
    # given, tokens scoring below -threshold are ignored.
    total = 0.0
    for w in tokenize(l):
        lp = self.lp(w)
        if threshold is None or lp >= -threshold:
            total += lp
    return total
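All of these scoring methods sit on a word-count model that the source does not show; they only assume that lp(w) returns a token log-probability and that _countWord(w) (used by feedLine further down) records an occurrence. A purely hypothetical minimal sketch, with the class name and the add-one smoothing both assumptions:

import math

class UnigramModel:
    # Hypothetical minimal model behind lp() and _countWord().
    def __init__(self):
        self.counts = {}  # token -> occurrences seen
        self.total = 0    # tokens seen overall

    def _countWord(self, w):
        self.counts[w] = self.counts.get(w, 0) + 1
        self.total += 1

    def lp(self, w):
        # Add-one smoothing keeps unseen tokens at a finite log-probability.
        count = self.counts.get(w, 0) + 1
        return math.log(count / (self.total + len(self.counts) + 1))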
Example #2
def fetchData(self, author):
    # Gather up to five publication titles plus any OAI record keywords,
    # then tokenize each and strip punctuation.
    pubs = author.paper.publication_set.all()[:5]
    titles = [p.full_title() for p in pubs]
    for r in author.paper.oairecord_set.all()[:5]:
        if r.keywords:
            titles.append(r.keywords)
    return [set(filter_punctuation(tokenize(t))) for t in titles]
Example #3
def nlProbLine(self, l):
    # Average log-probability per token; 0.0 for an empty line.
    total = 0.0
    lgt = 0
    for w in tokenize(l):
        total += self.lp(w)
        lgt += 1
    if lgt > 0:
        return total / lgt
    return 0.0
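Compared with lProbLine, dividing by the token count makes scores comparable across lines of different lengths. A toy check, with illustrative numbers only:

short = [-3.0, -5.0]        # token log-probs of a short line
long = short * 2            # the same tokens repeated
print(sum(short), sum(short) / len(short))  # -8.0  -4.0
print(sum(long), sum(long) / len(long))     # -16.0 -4.0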
Example #4
def _normalizedWScore(self, line, researcher, explain=False):
    # Per-token log-likelihood ratio between the department's topic
    # model and the general language model.
    topicScore = self.models[researcher.department_id].nlProbLine(line)
    langScore = self.lang.nlProbLine(line)
    if explain:
        for w in tokenize(line):
            a = self.models[researcher.department_id].lp(w)
            b = self.lang.lp(w)
            print('      ' + w + '\t' + str(a) + '-' + str(b) + ' = ' +
                  str(a - b))
    return topicScore - langScore
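Because both scores are average log-probabilities, their difference is a per-token log-likelihood ratio: positive values mean the line looks more like the department's vocabulary than like generic text. A toy illustration with made-up numbers:

import math

topic_lp = {'gradient': math.log(0.02), 'the': math.log(0.05)}
lang_lp = {'gradient': math.log(0.001), 'the': math.log(0.06)}

for w in ('gradient', 'the'):
    print(w, topic_lp[w] - lang_lp[w])
# gradient ~ +3.00  -> topic-specific word
# the      ~ -0.18  -> generic word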
Example #5
def get_distr(self, string, debug=False):
    # Tokenize
    words = tokenize(string)

    # To bag-of-words
    bow = self.dct.doc2bow(words)

    # To topics
    distr = self.lda[bow]
    if debug:
        for topic_id, value in distr[:10]:
            print("Topic id %d, value %.3f" % (topic_id, value))
            print(self._print_topic(self.lda.show_topic(topic_id)))

    return distr
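The doc2bow call and the self.lda indexing suggest gensim; a minimal setup sketch for the attributes this method assumes, where the training corpus, the topic count, and the plain .split() stand-in for tokenize are all illustrative:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = ['machine learning for libraries'.split(),
         'open access metadata harvesting'.split()]
dct = Dictionary(texts)
corpus = [dct.doc2bow(t) for t in texts]
lda = LdaModel(corpus=corpus, id2word=dct, num_topics=2)

bow = dct.doc2bow('metadata for machine learning'.split())
print(lda[bow])  # [(topic_id, weight), ...]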
Example #6
def probLine(self, l):
    # Product of raw token probabilities (not log-space).
    total = 1.0
    for w in tokenize(l):
        total *= self.p(w)
    return total
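Multiplying raw probabilities underflows to 0.0 for long lines, which is presumably why the log-space variants above exist; with no threshold applied, exp(lProbLine(l)) equals probLine(l), but only the log form stays numerically stable:

import math

probs = [1e-4] * 200                     # 200 unlikely tokens
print(math.prod(probs))                  # 0.0 -- underflow
print(sum(math.log(p) for p in probs))   # about -1842.1, still usable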
Example #7
def feedLine(self, l):
    # Training step: record every token of the line in the word counts.
    for w in tokenize(l):
        self._countWord(w)
Example #8
def test_tokenize(self):
    self.assertEqual(tokenize('Hello world!'), ['Hello', 'world!'])
    self.assertEqual(
        tokenize('99\tbottles\nof  beeron \tThe Wall'),
        ['99', 'bottles', 'of', 'beeron', 'The', 'Wall'])
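Both assertions pin down the behaviour: tokenize splits on runs of whitespace and leaves punctuation attached, which is why filter_punctuation exists as a separate step elsewhere. A one-line implementation satisfying this test, though the project's actual version may differ:

def tokenize(s):
    # str.split() with no argument splits on any whitespace run.
    return s.split()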
Example #9
def fetchData(self, author):
    # Join all non-empty contributor strings, then tokenize and
    # strip punctuation.
    contributors = [r.contributors for r in author.paper.oairecord_set.all()]
    contributors = [c for c in contributors if c is not None]
    ta = ' '.join(contributors)
    return set(filter_punctuation(tokenize(ta)))
Example #10
def fetchData(self, author):
    return set(filter_punctuation(tokenize(author.paper.title)))