Esempio n. 1
0
 def global_pmi(self, ngram):
     """Pointwise mutual information of *ngram* over the whole corpus.

     Computed as log(P(ngram) / prod(P(member))) where the denominator
     multiplies the individual probabilities of every member that
     carries meaning on its own.
     """
     joint_probability = self.global_probability(ngram)
     # product() accepts an iterable, so a single-member ngram works too
     independent_probability = product(
         self.global_probability([member])
         for member in ngram if member.has_meaning_alone())
     return math.log(joint_probability / independent_probability)
Esempio n. 2
0
 def statistical_mutual_information_confidence(self):
     """
     Number of occurrences of the ngram / number of ngrams possible
     /
     probability of each member of the ngram.

     The score is memoized in ``self._confidences`` and the cached
     value is returned on subsequent calls.
     """
     if self._confidences["statistical_mutual_information"] is None:
         if len(self) == 1:
             # PMI is meaningless for 1-grams; store the neutral score
             # instead of bypassing the cache (the early return left the
             # cached slot None, forcing a re-check on every call).
             # TODO : find better way for 1-grams...
             self._confidences["statistical_mutual_information"] = 1.0
         else:
             # How many positions an ngram of this length can occupy.
             ngram_possible = len(self.text) - len(self) + 1
             # Independence assumption: product of each member's
             # individual probability in the text.
             members_probability = product(
                 [1.0 * s.count / len(self.text) for s in self])
             self._confidences["statistical_mutual_information"] = \
                 math.log(1.0 * self.count / ngram_possible
                          / members_probability)
     return self._confidences["statistical_mutual_information"]
Esempio n. 3
0
 def statistical_mutual_information_confidence(self):
     """
     Number of occurrences of the ngram / number of ngrams possible
     /
     probability of each member of the ngram.

     The score is memoized in ``self._confidences``; later calls return
     the cached value.
     """
     if self._confidences["statistical_mutual_information"] is None:
         if len(self) == 1:
             # PMI is meaningless for 1-grams; cache the neutral score
             # rather than returning early and leaving the slot None
             # (which forced the guard to re-run on every call).
             # TODO : find better way for 1-grams...
             self._confidences["statistical_mutual_information"] = 1.0
         else:
             # Count of positions an ngram of this length can start at.
             ngram_possible = len(self.text) - len(self) + 1
             # Product of the members' independent probabilities.
             members_probability = product(
                 [1.0 * s.count / len(self.text) for s in self])
             self._confidences["statistical_mutual_information"] = \
                 math.log(1.0 * self.count / ngram_possible
                          / members_probability)
     return self._confidences["statistical_mutual_information"]
Esempio n. 4
0
 def heuristical_mutual_information_confidence(self):
     """
     Return the probability of all the terms of the ngram to appear together.

     The matter is to understand the dependence or independence of the
     terms. If just some terms appear out of this context, it may be
     normal (for example, a name, which appears sometimes with both
     firstname and lastname and sometimes with just lastname). And if
     these terms appear many times, but some others appear just in this
     context, the number doesn't count.
     If NO term appears out of this context, we have a good probability
     for a collocation.
     If each term appears out of this context, and especially if this
     occurs often, we can doubt this collocation candidate.
     Should we consider the stop_words?
     This may affect negatively and positively the main confidence.

     The score is memoized in ``self._confidences``.
     """
     if self._confidences["heuristical_mutual_information"] is None:
         # We test just the interesting stemms, but keep original positions
         candidates = [(k, v) for k, v in enumerate(self)
                       if v.has_meaning()]
         if len(self) == 1:
             score = 1  # Just one word, PMI doesn't make sense
         elif len(candidates) == 0:
             score = 0.1
         else:
             # For each meaningful stemm, count its occurrences that
             # appear WITHOUT the other members of this ngram nearby.
             alone_count = {}
             for position, stemm in candidates:
                 alone_count[position] = 0
                 neighbours = [(s, p - position) for p, s in enumerate(self)
                               if s is not stemm]
                 for tkn in stemm.occurrences:
                     if not tkn.is_neighbor(neighbours):
                         alone_count[position] += 1
             res = list(alone_count.values())
             if sum(res) == 0:
                 score = 3 * len(self)  # We trust this collocation
             elif 0 in res:
                 # At least one important term appears just in this context
                 score = 2
             else:
                 # We don't know, but we're not so confident...
                 # The more the terms appear alone, the less we are
                 # confident, so the smaller the coefficient.
                 score = product(
                     [2.0 * len(self) / (len(self) + v) for v in res])
         # Bug fix: the original returned from every branch above without
         # ever assigning the cache, so the memoization never took effect
         # and the score was recomputed on each call.
         self._confidences["heuristical_mutual_information"] = score
     return self._confidences["heuristical_mutual_information"]
Esempio n. 5
0
 def heuristical_mutual_information_confidence(self):
     """
     Return the probability of all the terms of the ngram to appear together.

     The matter is to understand the dependence or independence of the
     terms. If just some terms appear out of this context, it may be
     normal (for example, a name, which appears sometimes with both
     firstname and lastname and sometimes with just lastname). And if
     these terms appear many times, but some others appear just in this
     context, the number doesn't count.
     If NO term appears out of this context, we have a good probability
     for a collocation.
     If each term appears out of this context, and especially if this
     occurs often, we can doubt this collocation candidate.
     Should we consider the stop_words?
     This may affect negatively and positively the main confidence.

     The score is memoized in ``self._confidences``.
     """
     if self._confidences["heuristical_mutual_information"] is None:
         # We test just the interesting stemms, but keep original positions
         candidates = [(k, v) for k, v in enumerate(self) if v.has_meaning()]
         if len(self) == 1:
             score = 1  # Just one word, PMI doesn't make sense
         elif len(candidates) == 0:
             score = 0.1
         else:
             # For each meaningful stemm, count occurrences that appear
             # WITHOUT the other members of this ngram around them.
             alone_count = {}
             for position, stemm in candidates:
                 alone_count[position] = 0
                 neighbours = [(s, p - position) for p, s in enumerate(self)
                               if s is not stemm]
                 for tkn in stemm.occurrences:
                     if not tkn.is_neighbor(neighbours):
                         alone_count[position] += 1
             res = list(alone_count.values())
             if sum(res) == 0:
                 score = 3 * len(self)  # We trust this collocation
             elif 0 in res:
                 # At least one important term appears just in this context
                 score = 2
             else:
                 # We don't know, but we're not so confident...
                 # The more the terms appear alone, the less we are
                 # confident, so the smaller the coefficient.
                 score = product([2.0 * len(self) / (len(self) + v)
                                  for v in res])
         # Bug fix: every branch above used to return directly, so the
         # cache was never written and the memoization never took effect.
         self._confidences["heuristical_mutual_information"] = score
     return self._confidences["heuristical_mutual_information"]
Esempio n. 6
0
 def global_pmi(self, ngram):
     """Pointwise mutual information of *ngram* over the whole corpus.

     Returns log(P(ngram) / prod(P(member))), multiplying only the
     probabilities of members that carry meaning on their own.
     """
     joint = self.global_probability(ngram)
     # wrap each member in a list so product() gets one-element ngrams
     members = (self.global_probability([m])
                for m in ngram if m.has_meaning_alone())
     return math.log(joint / product(members))