Exemple #1
0
 def __init__(self, filters=tuple(), upload_type=UploadedFile,
              upload_storage=None):
     """Set up the field with an ``_UploadedFileSchema`` and keep its options.

     :param filters: stored on ``self._filters`` (presumably the filters
         applied to uploaded files -- confirm against the setter).
     :param upload_type: stored on ``self._upload_type``; defaults to
         ``UploadedFile``.
     :param upload_storage: stored on ``self._upload_storage``.
     """
     FieldProperty.__init__(self, _UploadedFileSchema())
     self._upload_storage = upload_storage
     self._upload_type = upload_type
     self._filters = filters
Exemple #2
0
class InformationValueResult(MappedClass):
    """Information-value (IV) scores of one document for one window size.

    ``iv_words`` is kept as a sequence of ``(word, iv)`` pairs; a dict passed
    to the constructor is converted to that form.
    """

    def __init__(self,
                 iv_words,
                 sum_threshold=config.SUM_THRESHOLD,
                 *args,
                 **kwargs):
        """Create a result.

        :param iv_words: dict ``{word: iv}`` or sequence of ``(word, iv)``
            pairs.
        :param sum_threshold: fraction of the highest IV values that
            ``iv_sum`` adds up; kept as a plain instance attribute.
        :param args: forwarded to ``MappedClass.__init__``.
        :param kwargs: forwarded to ``MappedClass.__init__``.
        """
        # isinstance also accepts dict subclasses (e.g. OrderedDict), and
        # items() works on both Python 2 and Python 3.
        if isinstance(iv_words, dict):
            iv_words = list(iv_words.items())
        self.sum_threshold = sum_threshold
        super(InformationValueResult, self).__init__(*args,
                                                     iv_words=iv_words,
                                                     **kwargs)

    @property
    def iv_sum(self):
        """Sum of the highest IV values, limited by ``sum_threshold``.

        Takes ``int(len(iv_words) * sum_threshold)`` values, or 10 when that
        amount is zero. As a side effect the largest IV value is stored in
        ``self.max_iv`` (``None`` when there are no words).

        :return: the sum, or ``0`` when ``iv_words`` is empty.
        """
        # Todo: improve performance of this...
        sorted_ivs = sorted(map(operator.itemgetter(1), self.iv_words),
                            reverse=True)
        if not sorted_ivs:
            # Nothing to sum; avoid IndexError on sorted_ivs[0].
            self.max_iv = None
            return 0
        self.max_iv = sorted_ivs[0]
        amount_to_be_taken = int(len(sorted_ivs) * self.sum_threshold) or 10
        # Sum from the smallest value upwards to improve numerical stability.
        return sum(reversed(sorted_ivs[:amount_to_be_taken]))

    class __mongometa__:
        session = odm_session
        name = 'information_value_result'
        # One stored result per doc_window_hash value.
        unique_indexes = [
            ('doc_window_hash', ),
        ]
        extensions = [DocumentWindowSizeDuplicateHash]

    def __repr__(self):
        """Short description: window size and number of iv-words."""
        return "IVR(%s window size, %s iv-words)" % (self.window_size,
                                                     len(self.iv_words))

    def __str__(self):
        """Same text as ``__repr__``."""
        return self.__repr__()

    _id = FieldProperty(schema.ObjectId)
    doc_window_hash = FieldProperty(schema.String)
    window_size = FieldProperty(schema.Int)
    iv_words = FieldProperty(schema.Array(schema.Anything))  # Array or list
    document_id = ForeignIdProperty('Document')
    document = RelationProperty('Document')
Exemple #3
0
        def __get__(self, instance, owner=None):
            """Return the stored value wrapped in ``self._upload_type``.

            ``None`` is returned when the underlying ``FieldProperty`` lookup
            raises ``AttributeError`` or the stored value is falsy.
            """
            try:
                stored = FieldProperty.__get__(self, instance, owner)
            except AttributeError:
                return None
            return self._upload_type(stored) if stored else None
Exemple #4
0
    def __get__(self, instance, owner=None):
        """Read the field and wrap a truthy value in ``self._upload_type``.

        A missing attribute (``AttributeError`` from ``FieldProperty``) is
        treated the same as an empty value: the result is ``None``.
        """
        try:
            raw = FieldProperty.__get__(self, instance, owner)
        except AttributeError:
            raw = None
        if raw:
            return self._upload_type(raw)
        return None
Exemple #5
0
        def __set__(self, instance, value):
            """Assign ``value`` to the field, recording the swap in depot history.

            A value that is neither ``None`` nor an ``UploadedFile`` is first
            wrapped with ``self._upload_type(value, self._upload_storage)``.
            ``UploadedFile`` values get ``self._filters`` applied before the
            old and new values are handed to the instance's depot history.
            """
            if not (value is None or isinstance(value, UploadedFile)):
                value = self._upload_type(value, self._upload_storage)

            if isinstance(value, UploadedFile):
                value._apply_filters(self._filters)

            # Read the previous value before it is overwritten below.
            previous = self.__get__(instance, instance.__class__)
            history = DepotExtension.get_depot_history(instance)
            history.swap(previous, value)
            return FieldProperty.__set__(self, instance, value)
Exemple #6
0
    def __set__(self, instance, value):
        """Store ``value`` on the field and register the change in depot history.

        Anything other than ``None`` or an ``UploadedFile`` is converted with
        ``self._upload_type(value)``; ``UploadedFile`` values then have
        ``self._filters`` applied.
        """
        if not (value is None or isinstance(value, UploadedFile)):
            value = self._upload_type(value)

        if isinstance(value, UploadedFile):
            value._apply_filters(self._filters)

        # Fetch the current value first so the history sees old -> new.
        previous = self.__get__(instance, instance.__class__)
        history = DepotExtension.get_depot_history(instance)
        history.swap(previous, value)
        return FieldProperty.__set__(self, instance, value)
Exemple #7
0
 def __init__(self,
              filters=tuple(),
              upload_type=UploadedFile,
              upload_storage=None):
     """Initialise the field with an ``Anything()`` schema and keep the options.

     The three arguments are stored unchanged on ``self._filters``,
     ``self._upload_type`` and ``self._upload_storage``.
     """
     FieldProperty.__init__(self, Anything())
     self._upload_storage = upload_storage
     self._upload_type = upload_type
     self._filters = filters
Exemple #8
0
 def __init__(self,
              filters=tuple(),
              upload_type=UploadedFile):
     """Initialise the field with an ``Anything()`` schema.

     ``filters`` and ``upload_type`` are stored unchanged on
     ``self._filters`` and ``self._upload_type``.
     """
     FieldProperty.__init__(self, Anything())
     self._upload_type = upload_type
     self._filters = filters
Exemple #9
0
class Document(MappedClass):
    """A text document plus its cached information-value (IV) results.

    Each related ``InformationValueResult`` holds the ``(word, iv)`` pairs
    computed for one window size.
    """

    class __mongometa__:
        session = odm_session
        name = 'document'

    _id = FieldProperty(schema.ObjectId)
    url = FieldProperty(schema.String, unique=True)
    name = FieldProperty(schema.String)
    text = FieldProperty(schema.String)
    month = FieldProperty(schema.String)
    year = FieldProperty(schema.Int)
    # Used by __get_sense_for as a mapping: word -> stored sense name.
    related_sense = FieldProperty(schema.Anything)
    number_of_words = FieldProperty(schema.Int)
    results = RelationProperty(InformationValueResult)

    def top_words(self,
                  total_words=20,
                  stop_words=stopwords.words('english'),
                  greater_than_zero=True,
                  window_size=None):
        """Return the highest-IV words, each paired with a uniform weight.

        :param total_words: maximum number of words to return.
        :param stop_words: iterable of words to leave out.
        :param greater_than_zero: when true, only words whose IV is above
            0.0 are kept.
        :param window_size: window size to use; when ``None`` the window
            size of ``get_information_value_result()`` is used.
        :return: list of ``(word, weight)`` tuples in descending IV order.
        """
        log.info('Calculate top words. total %s' % total_words)
        if window_size is None:
            window_size = self.get_information_value_result().window_size
        # Build the lookup set once instead of scanning a list per word.
        excluded = frozenset(stop_words)
        iv_words = self.get_iv_by_window_size(window_size)
        iv_words = [
            (t[0], t[1]) for t in iv_words
            if t[0] not in excluded and (not greater_than_zero or t[1] > 0.0)
        ][:total_words]

        # NOTE(review): iv_words was just cut to total_words, so this max()
        # always equals total_words -- confirm whether min() was intended.
        effective_total_words = max(total_words, len(iv_words))
        return [(word, 1.0 / effective_total_words)
                for (word, iv_value) in iv_words]

    def top_senses(self, total_senses=20):
        """Return up to ``total_senses`` senses for the top words.

        Looks at the first ``total_senses * 2`` top words and skips the ones
        for which ``wisdom.NoSenseFound`` is raised. The ODM session is
        flushed before returning.
        """
        log.info('Calculate top senses. total %s' % total_senses)
        top_words = (word for word, _ in self.top_words(total_senses * 2))
        senses = []
        for word in top_words:
            try:
                sense = self.__get_sense_for(word)
                senses.append(sense)
                if len(senses) >= total_senses:
                    break
            except wisdom.NoSenseFound:
                pass
        odm_session.flush()
        return senses

    def __get_sense_for(self, word):
        """Return the synset for ``word``, caching its name in ``related_sense``."""
        # ``in`` replaces the Python 2 only dict.has_key().
        if word not in self.related_sense:
            sense = wisdom.lesk(self.text, word, pos='n')
            self.related_sense[word] = sense.name
        return wn.synset(self.related_sense[word])

    # calculator_class is poor man's dependency injection :)
    def get_iv_by_window_size(self,
                              window_size,
                              calculator_class=InformationValueCalculator):
        """Return the ``(word, iv)`` pairs for ``window_size``, highest IV first.

        An existing result with that window size is reused; otherwise the
        values are computed with ``calculator_class``, wrapped in a new
        ``InformationValueResult`` and the session is flushed.

        :param window_size: window size to look up or compute.
        :param calculator_class: called as
            ``calculator_class(tokens).information_value(window_size)``.
        :return: list of pairs sorted by descending IV.
        """
        def sort(iv_words):
            return sorted(iv_words, key=operator.itemgetter(1), reverse=True)

        for res in self.results:
            if res.window_size == window_size:
                return sort(res.iv_words)

        iv_words = calculator_class(self.tokens).information_value(window_size)
        res = InformationValueResult(window_size=window_size,
                                     document=self,
                                     iv_words=iv_words)

        try:
            odm_session.flush()
        except DuplicateKeyError:
            # Presumably an equal result was stored concurrently; the values
            # computed here are still returned.
            pass
        # Return on the success path too (the return used to sit inside the
        # except branch, so a clean flush produced None).
        return sort(res.iv_words)

    def get_information_value_result(self):
        """Return the stored result whose IV values have the largest sum.

        On ties the later result wins. ``None`` is returned when there are no
        results or every sum is negative.
        """
        iv_res = None
        best_iv = 0.0
        for one_iv in self.results:
            # Generator expression instead of a tuple-parameter lambda,
            # which is a syntax error on Python 3.
            sum_iv = sum(iv for _, iv in one_iv.iv_words)
            if best_iv <= sum_iv:
                best_iv = sum_iv
                iv_res = one_iv
        return iv_res

    @property
    def tokens(self):
        """Tokens of ``text``, using ``self.tokenizer`` when set, else ``tokenize``."""
        tokenizer_func = getattr(self, 'tokenizer', tokenize)
        return tokenizer_func(self.text)

    #trivial, removes 'Lenin: ' as prefix

    @property
    def short_name(self):
        """Name without ``"Lenin: "``, cut at the first space at or after index 40.

        When there is no such space the whole name is kept. ``"..."`` is
        always appended.
        """
        ss = self.name.replace("Lenin: ", "")
        space_at = ss.find(" ", 40)
        # find() returns -1 when nothing matches; slicing with that value
        # would silently drop the last character of the name.
        cut = len(ss) if space_at == -1 else space_at
        return ss[:cut] + "..."

    #generators test
    def result_list(self):
        """Yield every related result."""
        for each in self.results:
            yield each

    @property
    def total_results(self):
        """Number of related results."""
        return len(self.results)

    @property
    def total_tokens(self):
        """Number of tokens in ``text``."""
        return len(self.tokens)

    @property
    def sentences(self):
        """Sentences of ``text`` as returned by ``sent_tokenize``."""
        return sent_tokenize(self.text)

    def __str__(self):
        """Same text as ``__repr__``."""
        return self.__repr__()

    def __repr__(self):
        """Summary with year, month, short name, token count and results."""
        total_results = self.total_results
        params = (unicode(self.year).encode('utf-8'),
                  unicode(self.month.capitalize()).encode('utf-8'),
                  unicode(self.short_name).encode('utf-8'),
                  unicode(self.total_tokens).encode('utf-8'),
                  unicode(total_results).encode('utf-8'))
        if total_results > 0:
            res = "Doc(%s, %s - %s, %s tks, %s res:" % params
            for iv_res in self.result_list():
                res += " " + iv_res.__repr__()
            # Close the parenthesis once, after all results are listed.
            res += ")"
            return res
        else:
            return "Doc(%s, %s - %s, %s tks, %s res)" % params

    #unset all words with 0.0 as value for iv_words of all IVResults
    def no_zero_results(self):
        """Strip non-positive IV words from every result and return the results."""
        return [self.aux_clean_zeros(each) for each in self.results]

    #Takes an IVResults and clean all iv_words with 0.0
    def aux_clean_zeros(self, result):
        """Keep only the words of ``result`` whose IV is above 0.0.

        ``result.iv_words`` is replaced in place; a dict stays a dict and a
        sequence of ``(word, iv)`` pairs becomes a list of pairs.

        :return: the same ``result`` object.
        """
        iv_words = result.iv_words
        if isinstance(iv_words, dict):
            result.iv_words = dict(
                (w, c) for w, c in iv_words.items() if c > 0.0)
        else:
            # iv_words is normally stored as pairs, which have no .items().
            result.iv_words = [pair for pair in iv_words if pair[1] > 0.0]
        return result