Example #1
0
class Query(object):
    """
    Estimates whether a single book is Elizabethan or Romantic.

    The word rates of the book are matched against the word categories and
    the conditional probabilities stored in the database behind ``db_url``.

    Can be used as a context manager: entering runs the query and leaving
    removes the downloaded copy of the book, if there is one.
    """

    def __init__(self, text_dir, db_url, book_url, should_download=False):
        """
        ``text_dir`` is the directory where a copy of text should be put.
        ``db_url`` should be the url to a database that already exists.
        ``book_url`` is the location of the book to analyze.
        ``should_download`` tells how ``book_url`` is used: when true the book
        is fetched through the extractor, otherwise ``book_url`` is read
        directly as a local path.
        """
        self.text_dir = text_dir
        self.db_url = db_url
        self.book_url = book_url
        self.should_download = should_download
        self.manager = Manager(db_url)
        self.extractor = Extractor(text_dir)

    def __enter__(self):
        try:
            self.run()
        except Exception:
            # ``__exit__`` is never called when ``__enter__`` raises, so the
            # downloaded copy has to be removed here or it would be leaked.
            self.clean_up()
            raise
        return self

    def __exit__(self, type, value, traceback):
        self.clean_up()

    def run(self):
        """
        Runs the whole pipeline and stores the outcome in
        ``elizabethan_factor`` and ``romantic_factor``.
        """
        word_rates = self._word_rates()
        word_categories = self._word_categories(word_rates)
        wcp = self._word_conditional_probabilities(word_categories)
        e, r = self._probabilities(wcp)
        self.elizabethan_factor = e
        self.romantic_factor = r

    def results(self):
        """
        Returns a tuple (e, r) with the factor that this book be Elizabethan
        or Romantic respectively.

        Only available after ``run`` has finished.
        """
        return self.elizabethan_factor, self.romantic_factor

    def clean_up(self):
        """
        Removes the downloaded copy of the book. Does nothing when the book
        was not downloaded or no file has been recorded yet.
        """
        filename = getattr(self, 'filename', None)
        if self.should_download and filename is not None:
            os.remove(filename)

    def _word_rates(self):
        """
        Downloads the book if needed, otherwise reads ``book_url`` as is.
        Returns the dictionary produced by the extractor for the book and
        keeps it in ``self.word_rates``.
        """
        if self.should_download:
            self.filename = self.extractor.download_book(self.book_url, True)
        else:
            self.filename = self.book_url
        word_rates = self.extractor.read_text(self.filename)
        self.word_rates = word_rates
        return word_rates

    def _word_categories(self, word_rates):
        """
        Generator of WordCategory objects for the given book.

        For every word that is both in the book and in the database, yields
        the WordCategory whose range contains the rate of the word in the
        book. For every word of the database that did not appear in the
        book, yields its WordCategory of the ``low`` category.
        """
        session = self.manager.session
        # ``sum`` (unlike ``reduce`` without initial value) accepts an empty
        # book.
        total_words = sum(word_rates.values())
        rates = {word: float(count) / total_words
                 for word, count in word_rates.items()}

        # Rows come back as tuples, so the text has to be unpacked before it
        # can be compared with the keys of ``rates``. The ids are collected
        # (not the texts) because ``WordCategory.id_word`` is matched against
        # ``Word.id`` below.
        missing_word_ids = [
            word_id
            for word_id, text in session.query(Word.id, Word.text).all()
            if text not in rates]

        low = session.query(Category).\
            filter(Category.description == 'low').one()
        word_category_query = session.query(WordCategory)

        # The lookups are done in slices to keep the IN (...) lists bounded.
        for texts in dict_key_slice(rates, MAX_SLICE_SIZE):
            words = session.query(Word).\
                filter(Word.text.in_(texts)).all()
            for word in words:
                rate = rates.get(word.text)
                # NOTE(review): ``one()`` raises when no range contains the
                # rate -- confirm the ranges cover every possible rate.
                yield word_category_query.\
                    filter(WordCategory.id_word == word.id).\
                    filter(WordCategory.min_range <= rate).\
                    filter(WordCategory.max_range > rate).one()

        for ids in list_slices(missing_word_ids, MAX_SLICE_SIZE):
            low_categories = word_category_query.\
                filter(WordCategory.id_word.in_(ids)).\
                filter(WordCategory.id_category == low.id).all()
            for word_category in low_categories:
                yield word_category

    def _word_conditional_probability(self, word_id, category_id, period_id):
        """
        Returns the instance of WordConditionalProbability stored for the
        given word, category and period (exactly one row is expected).
        """
        p = self.manager.session.query(WordConditionalProbability)
        p = p.filter_by(id_word=word_id, id_category=category_id,
                        id_period=period_id)
        return p.one()

    def _word_conditional_probabilities(self, word_categories):
        """
        Receives an iterable of WordCategory objects.
        Yields tuples of ``(e, r)`` where ``e`` and ``r`` are the
        probabilities that the word and category be in Elizabethan and
        Romantic periods respectively.
        """
        elizabethan = self.manager.elizabethan_period
        romantic = self.manager.romantic_period

        for wc in word_categories:
            word_id = wc.id_word
            category_id = wc.id_category
            e = self._word_conditional_probability(
                word_id, category_id, elizabethan.id).probability
            r = self._word_conditional_probability(
                word_id, category_id, romantic.id).probability
            yield e, r

    def _probabilities(self, conditional_probabilities):
        """
        Receives an iterable as returned by
        ``_word_conditional_probabilities``.

        Returns a tuple ``(e, r)`` of the factor that this book be
        Elizabethan or Romantic respectively. Both factors start at the share
        of books of each period and are multiplied by every pair whose two
        probabilities are non zero. As soon as one of the factors becomes
        zero or infinite the last usable pair of factors is returned.
        """
        elizabethan_book_count = self.manager.elizabethan_book_count
        romantic_book_count = self.manager.romantic_book_count
        total_books = elizabethan_book_count + romantic_book_count
        elizabethan_factor = float(elizabethan_book_count) / total_books
        romantic_factor = float(romantic_book_count) / total_books

        # Starts at the priors so that there is always something to return,
        # even when the very first product is already unusable.
        last_usable = (elizabethan_factor, romantic_factor)
        unusable = (0, float('inf'))
        for e, r in conditional_probabilities:
            if e == 0 or r == 0:
                continue
            # Small probabilities are scaled up (both by the same amount) to
            # delay the moment the products run out of floating point range.
            scale = 100 if (e < 0.1 or r < 0.1) else 1
            elizabethan_factor *= scale * e
            romantic_factor *= scale * r

            if elizabethan_factor in unusable or romantic_factor in unusable:
                return last_usable
            last_usable = (elizabethan_factor, romantic_factor)
        return elizabethan_factor, romantic_factor

    def top(self, count):
        """
        Returns the ``count`` entries ``(word, rate)`` of the book with the
        highest rate, highest first.
        """
        ordered = sorted(self.word_rates.items(), key=lambda item: -item[1])
        return ordered[0:count]
Example #2
0
class Trainer(object):
    """
    Fills the database behind ``db_url`` from the books listed in a JSON
    file: downloads the books, stores their words, builds the categories of
    every word and the conditional probabilities per period.
    """

    def __init__(self, json_path, text_dir, db_url):
        """
        ``json_path`` is the path of the JSON file that lists the books.
        ``text_dir`` is the directory of the texts, created if missing.
        ``db_url`` is the url of the database.
        """
        self.json_path = json_path
        self.text_dir = text_dir
        self.db_url = db_url
        if not isdir(self.text_dir):
            mkdir(self.text_dir)
        self.extractor = Extractor(text_dir)
        self.manager = Manager(db_url)

    def json(self):
        """
        Returns a list of tuples ``(author, title, period, url)``, one per
        entry of the JSON file. The file is read only the first time.
        """
        if not hasattr(self, "_json"):
            with open(self.json_path, "r") as f:
                texts = json.load(f)
            self._json = [
                (text["Author"], text["Title"], text["Period"], text["URL"])
                for text in texts]
        # A copy, so that callers can not alter the cached list.
        return list(self._json)

    def get_books(self):
        """
        Downloads the book if it's not in the texts directory.
        """
        files = set(listdir(self.text_dir))
        for author, title, period, url in self.json():
            filename = format_filename(author, title)
            if filename in files:
                logger.debug("%s already downloaded" % filename)
                continue
            logger.debug("Getting %s" % filename)
            self.extractor.download_book(url, False, author, title, period)

    def train(self):
        """
        Runs every step of the training in order and closes the sessions.
        """
        logger.debug("      STARTING get_books")
        self.get_books()
        logger.debug("      STARTING populate")
        self.populate()
        logger.debug("      STARTING categories")
        self.categories()
        logger.debug("      STARTING conditional_probability")
        self.conditional_probability()
        self.manager.session.close_all()

    def populate(self):
        """
        Stores the period, the book and the words of every listed book.
        Books without words are skipped.
        """
        for author, title, period, url in self.json():
            # TODO clean the next line
            words = self.extractor.read_text(format_filename(author, title))
            if not words:
                continue
            total_words = sum(words.values())

            # insert period
            period_obj = self.manager.get_or_insert(
                dict_val={'name': period},
                instance=models.Period,
                list_search=['name'])

            # insert book
            logger.debug("Total Words: %s", total_words)
            dic_book = {
                'name': title,
                'author': author,
                'period': period_obj,
                'total_words': total_words,
                'sentence_total': 0,
            }
            book_obj = self.manager.get_or_insert(
                dict_val=dic_book,
                instance=models.Book,
                list_search=['name', 'author', 'period'])

            # words
            logger.debug("Period id : %s %s" %
                         (period_obj.id, period_obj.name))
            logger.debug("Book id : %s %s %s" %
                         (book_obj.id, book_obj.name, book_obj.author))
            self.manager.insert_words(words, book_obj, total_words)

    def categories(self):
        """
        Builds the categories of every word of the database and commits.
        """
        words_all = self.manager.get({}, Word, [], True)
        total = len(words_all)
        logger.debug("  categories Words %s" % total)
        for word_obj in words_all:
            self.calculate_categories(word_obj=word_obj)
            total -= 1
            if total % 500 == 0:
                logger.debug("Progressing Word -- Category... %s" % total)
        self.manager.session.commit()

    def calculate_categories(self, word_obj=None):
        """
        Builds the categories of ``word_obj`` from its maximum and minimum
        rate. Returns ``False`` when no word is given.
        """
        if not word_obj:
            return False
        max_rate, min_rate = self.manager.get_max_min_rate(word_obj)
        self.manager.construct_categories(min_rate, max_rate, word_obj)

    def period_probability(self, period, log=False):
        """
        Returns the number of books of that period (it is a count, the
        caller uses it as the denominator of the probability).
        """
        books_period = self.manager.session.query(Book).filter_by(
            period=period).count()
        if log:
            logger.debug("      books_period = %f " % (books_period))
        return books_period

    def word_category_period_probability(self,
                                         word,
                                         category,
                                         period,
                                         log=False):
        """
        Counts how many books of that period have that word in that
        category, that is, with a rate inside the range of the category
        (minimum included, maximum excluded).
        """
        session = self.manager.session
        num_books__word_cat = 0
        books_period = session.query(Book).filter_by(period=period).all()
        if books_period:
            # The range does not depend on the book, so it is looked up once
            # instead of once per book.
            word_category = session.query(WordCategory).filter_by(
                category=category, word=word).one()
        for book in books_period:
            # the book contains the word
            book_word = session.query(WordCount).filter_by(
                book=book, word=word).all()
            # without a relation between book and word nothing is counted
            if not book_word:
                continue
            rate = book_word[0].rate
            if word_category.min_range <= rate < word_category.max_range:
                num_books__word_cat += 1
        if log:
            logger.debug("      num_books__word_cat= %f" %
                         (num_books__word_cat))

        return num_books__word_cat

    def probability(self, word, category, period, log=False):
        """
        Probability of that word in that category in that period: the books
        of the period that have the word in the category divided by the
        number of books of the period.

        Raises ``ZeroDivisionError`` when the period has no books.
        """
        matching_books = self.word_category_period_probability(
            word, category, period, log=log)
        books_in_period = self.period_probability(period, log=log)
        # Both values are integer counts: without ``float`` the division
        # would be truncated to 0 or 1 under Python 2.
        result = float(matching_books) / books_in_period
        if log:
            logger.debug(
                "  word cat period prob = %f / period prob = %f = %f" %
                (matching_books, books_in_period, result))
        return result

    def conditional_probability(self):
        """
        Replaces every WordConditionalProbability with a freshly calculated
        one for each combination of period, category and word, commits and
        then runs ``complete_probability``.
        """
        session = self.manager.session
        session.query(WordConditionalProbability).delete()
        bulk = []
        words_all = session.query(Word).all()
        periods = session.query(Period).all()
        categories = session.query(Category).all()
        for period in periods:
            logger.debug(period.name)
            for category in categories:
                logger.debug(category.description)
                total = len(words_all)
                for word in words_all:
                    prob = self.probability(word=word,
                                            category=category,
                                            period=period)
                    if prob > 1:
                        # Should not happen: calculated again with logging
                        # to leave a trace of the numbers involved.
                        logger.debug("word %s category %s  period %s prob %s" %
                                     (word.text, category.description,
                                      period.name, prob))
                        self.probability(word=word,
                                         category=category,
                                         period=period,
                                         log=True)
                    bulk.append(WordConditionalProbability(
                        word=word,
                        category=category,
                        period=period,
                        probability=prob))
                    total -= 1
                    if total % 500 == 0:
                        logger.debug("left ... %s words" % total)
        session.add_all(bulk)
        session.commit()
        self.complete_probability()

    def complete_probability(self):
        """
        Sets the probability of the ``low`` category of every word and
        period to one minus the sum of the probabilities of the ``med``,
        ``high`` and ``high_high`` categories, and commits.
        """
        session = self.manager.session
        list_cat = ['med', 'high', 'high_high']
        # A real list, built once, instead of a new generator per query.
        other_ids = [c.id for c in session.query(Category).filter(
            Category.description.in_(list_cat)).all()]
        low = session.query(Category).filter(
            Category.description == 'low').one()

        words_all = session.query(Word).all()
        periods = session.query(Period).all()
        for period in periods:
            total = len(words_all)
            for word in words_all:
                sum_3cat = session.query(
                    func.sum(WordConditionalProbability.probability)).filter(
                        and_(
                            WordConditionalProbability.id_category.in_(
                                other_ids),
                            WordConditionalProbability.id_word == word.id,
                            WordConditionalProbability.id_period ==
                            period.id)).all()[0][0]
                cat_low = session.query(
                    WordConditionalProbability).filter(
                        and_(WordConditionalProbability.id_category == low.id,
                             WordConditionalProbability.id_word == word.id,
                             WordConditionalProbability.id_period ==
                             period.id)).all()
                # The sum is None when no row matched; count it as zero.
                cat_low[0].probability = 1 - (sum_3cat or 0)
                total -= 1
                if total % 500 == 0:
                    logger.debug("left ... %s words" % total)
        session.commit()
Example #3
0
class Query(object):
    """
    Estimates whether a single book is Elizabethan or Romantic.

    The word rates of the book are matched against the word categories and
    the conditional probabilities stored in the database behind ``db_url``.

    Can be used as a context manager: entering runs the query and leaving
    removes the downloaded copy of the book, if there is one.
    """

    def __init__(self, text_dir, db_url, book_url, should_download=False):
        """
        ``text_dir`` is the directory where a copy of text should be put.
        ``db_url`` should be the url to a database that already exists.
        ``book_url`` is the location of the book to analyze.
        ``should_download`` tells how ``book_url`` is used: when true the book
        is fetched through the extractor, otherwise ``book_url`` is read
        directly as a local path.
        """
        self.text_dir = text_dir
        self.db_url = db_url
        self.book_url = book_url
        self.should_download = should_download
        self.manager = Manager(db_url)
        self.extractor = Extractor(text_dir)

    def __enter__(self):
        try:
            self.run()
        except Exception:
            # ``__exit__`` is never called when ``__enter__`` raises, so the
            # downloaded copy has to be removed here or it would be leaked.
            self.clean_up()
            raise
        return self

    def __exit__(self, type, value, traceback):
        self.clean_up()

    def run(self):
        """
        Runs the whole pipeline and stores the outcome in
        ``elizabethan_factor`` and ``romantic_factor``.
        """
        word_rates = self._word_rates()
        word_categories = self._word_categories(word_rates)
        wcp = self._word_conditional_probabilities(word_categories)
        e, r = self._probabilities(wcp)
        self.elizabethan_factor = e
        self.romantic_factor = r

    def results(self):
        """
        Returns a tuple (e, r) with the factor that this book be Elizabethan
        or Romantic respectively.

        Only available after ``run`` has finished.
        """
        return self.elizabethan_factor, self.romantic_factor

    def clean_up(self):
        """
        Removes the downloaded copy of the book. Does nothing when the book
        was not downloaded or no file has been recorded yet.
        """
        filename = getattr(self, 'filename', None)
        if self.should_download and filename is not None:
            os.remove(filename)

    def _word_rates(self):
        """
        Downloads the book if needed, otherwise reads ``book_url`` as is.
        Returns the dictionary produced by the extractor for the book and
        keeps it in ``self.word_rates``.
        """
        if self.should_download:
            self.filename = self.extractor.download_book(self.book_url, True)
        else:
            self.filename = self.book_url
        word_rates = self.extractor.read_text(self.filename)
        self.word_rates = word_rates
        return word_rates

    def _word_categories(self, word_rates):
        """
        Generator of WordCategory objects for the given book.

        For every word that is both in the book and in the database, yields
        the WordCategory whose range contains the rate of the word in the
        book. For every word of the database that did not appear in the
        book, yields its WordCategory of the ``low`` category.
        """
        session = self.manager.session
        # ``sum`` (unlike ``reduce`` without initial value) accepts an empty
        # book.
        total_words = sum(word_rates.values())
        rates = {
            word: float(count) / total_words
            for word, count in word_rates.items()
        }

        # Rows come back as tuples, so the text has to be unpacked before it
        # can be compared with the keys of ``rates``. The ids are collected
        # (not the texts) because ``WordCategory.id_word`` is matched against
        # ``Word.id`` below.
        missing_word_ids = [
            word_id
            for word_id, text in session.query(Word.id, Word.text).all()
            if text not in rates
        ]

        low = session.query(Category).\
            filter(Category.description == 'low').one()
        word_category_query = session.query(WordCategory)

        # The lookups are done in slices to keep the IN (...) lists bounded.
        for texts in dict_key_slice(rates, MAX_SLICE_SIZE):
            words = session.query(Word).\
                filter(Word.text.in_(texts)).all()
            for word in words:
                rate = rates.get(word.text)
                # NOTE(review): ``one()`` raises when no range contains the
                # rate -- confirm the ranges cover every possible rate.
                yield word_category_query.\
                    filter(WordCategory.id_word == word.id).\
                    filter(WordCategory.min_range <= rate).\
                    filter(WordCategory.max_range > rate).one()

        for ids in list_slices(missing_word_ids, MAX_SLICE_SIZE):
            low_categories = word_category_query.\
                filter(WordCategory.id_word.in_(ids)).\
                filter(WordCategory.id_category == low.id).all()
            for word_category in low_categories:
                yield word_category

    def _word_conditional_probability(self, word_id, category_id, period_id):
        """
        Returns the instance of WordConditionalProbability stored for the
        given word, category and period (exactly one row is expected).
        """
        p = self.manager.session.query(WordConditionalProbability)
        p = p.filter_by(id_word=word_id,
                        id_category=category_id,
                        id_period=period_id)
        return p.one()

    def _word_conditional_probabilities(self, word_categories):
        """
        Receives an iterable of WordCategory objects.
        Yields tuples of ``(e, r)`` where ``e`` and ``r`` are the
        probabilities that the word and category be in Elizabethan and
        Romantic periods respectively.
        """
        elizabethan = self.manager.elizabethan_period
        romantic = self.manager.romantic_period

        for wc in word_categories:
            word_id = wc.id_word
            category_id = wc.id_category
            e = self._word_conditional_probability(
                word_id, category_id, elizabethan.id).probability
            r = self._word_conditional_probability(
                word_id, category_id, romantic.id).probability
            yield e, r

    def _probabilities(self, conditional_probabilities):
        """
        Receives an iterable as returned by
        ``_word_conditional_probabilities``.

        Returns a tuple ``(e, r)`` of the factor that this book be
        Elizabethan or Romantic respectively. Both factors start at the share
        of books of each period and are multiplied by every pair whose two
        probabilities are non zero. As soon as one of the factors becomes
        zero or infinite the last usable pair of factors is returned.
        """
        elizabethan_book_count = self.manager.elizabethan_book_count
        romantic_book_count = self.manager.romantic_book_count
        total_books = elizabethan_book_count + romantic_book_count
        elizabethan_factor = float(elizabethan_book_count) / total_books
        romantic_factor = float(romantic_book_count) / total_books

        # Starts at the priors so that there is always something to return,
        # even when the very first product is already unusable.
        last_usable = (elizabethan_factor, romantic_factor)
        unusable = (0, float('inf'))
        for e, r in conditional_probabilities:
            if e == 0 or r == 0:
                continue
            # Small probabilities are scaled up (both by the same amount) to
            # delay the moment the products run out of floating point range.
            scale = 100 if (e < 0.1 or r < 0.1) else 1
            elizabethan_factor *= scale * e
            romantic_factor *= scale * r

            if elizabethan_factor in unusable or romantic_factor in unusable:
                return last_usable
            last_usable = (elizabethan_factor, romantic_factor)
        return elizabethan_factor, romantic_factor

    def top(self, count):
        """
        Returns the ``count`` entries ``(word, rate)`` of the book with the
        highest rate, highest first.
        """
        ordered = sorted(self.word_rates.items(), key=lambda item: -item[1])
        return ordered[0:count]
Example #4
0
class Trainer(object):
    """
    Fills the database behind ``db_url`` from the books listed in a JSON
    file: downloads the books, stores their words, builds the categories of
    every word and the conditional probabilities per period.
    """

    def __init__(self, json_path, text_dir, db_url):
        """
        ``json_path`` is the path of the JSON file that lists the books.
        ``text_dir`` is the directory of the texts, created if missing.
        ``db_url`` is the url of the database.
        """
        self.json_path = json_path
        self.text_dir = text_dir
        self.db_url = db_url
        if not isdir(self.text_dir):
            mkdir(self.text_dir)
        self.extractor = Extractor(text_dir)
        self.manager = Manager(db_url)

    def json(self):
        """
        Returns a list of tuples ``(author, title, period, url)``, one per
        entry of the JSON file. The file is read only the first time.
        """
        if not hasattr(self, "_json"):
            with open(self.json_path, "r") as f:
                texts = json.load(f)
            self._json = [
                (text["Author"], text["Title"], text["Period"], text["URL"])
                for text in texts]
        # A copy, so that callers can not alter the cached list.
        return list(self._json)

    def get_books(self):
        """
        Downloads the book if it's not in the texts directory.
        """
        files = set(listdir(self.text_dir))
        for author, title, period, url in self.json():
            filename = format_filename(author, title)
            if filename in files:
                logger.debug("%s already downloaded" % filename)
                continue
            logger.debug("Getting %s" % filename)
            self.extractor.download_book(url, False, author, title, period)

    def train(self):
        """
        Runs every step of the training in order and closes the sessions.
        """
        logger.debug("      STARTING get_books")
        self.get_books()
        logger.debug("      STARTING populate")
        self.populate()
        logger.debug("      STARTING categories")
        self.categories()
        logger.debug("      STARTING conditional_probability")
        self.conditional_probability()
        self.manager.session.close_all()

    def populate(self):
        """
        Stores the period, the book and the words of every listed book.
        Books without words are skipped.
        """
        for author, title, period, url in self.json():
            # TODO clean the next line
            words = self.extractor.read_text(format_filename(author, title))
            if not words:
                continue
            total_words = sum(words.values())

            # insert period
            period_obj = self.manager.get_or_insert(
                dict_val={'name': period},
                instance=models.Period,
                list_search=['name'])

            # insert book
            logger.debug("Total Words: %s", total_words)
            dic_book = {
                'name': title,
                'author': author,
                'period': period_obj,
                'total_words': total_words,
                'sentence_total': 0,
            }
            book_obj = self.manager.get_or_insert(
                dict_val=dic_book,
                instance=models.Book,
                list_search=['name', 'author', 'period'])

            # words
            logger.debug("Period id : %s %s" %
                         (period_obj.id, period_obj.name))
            logger.debug("Book id : %s %s %s" %
                         (book_obj.id, book_obj.name, book_obj.author))
            self.manager.insert_words(words, book_obj, total_words)

    def categories(self):
        """
        Builds the categories of every word of the database and commits.
        """
        words_all = self.manager.get({}, Word, [], True)
        total = len(words_all)
        logger.debug("  categories Words %s" % total)
        for word_obj in words_all:
            self.calculate_categories(word_obj=word_obj)
            total -= 1
            if total % 500 == 0:
                logger.debug("Progressing Word -- Category... %s" % total)
        self.manager.session.commit()

    def calculate_categories(self, word_obj=None):
        """
        Builds the categories of ``word_obj`` from its maximum and minimum
        rate. Returns ``False`` when no word is given.
        """
        if not word_obj:
            return False
        max_rate, min_rate = self.manager.get_max_min_rate(word_obj)
        self.manager.construct_categories(min_rate, max_rate, word_obj)

    def period_probability(self, period, log=False):
        """
        Returns the number of books of that period (it is a count, the
        caller uses it as the denominator of the probability).
        """
        books_period = self.manager.session.query(Book).filter_by(
            period=period).count()
        if log:
            logger.debug("      books_period = %f " % (books_period))
        return books_period

    def word_category_period_probability(self, word, category, period,
                                         log=False):
        """
        Counts how many books of that period have that word in that
        category, that is, with a rate inside the range of the category
        (minimum included, maximum excluded).
        """
        session = self.manager.session
        num_books__word_cat = 0
        books_period = session.query(Book).filter_by(period=period).all()
        if books_period:
            # The range does not depend on the book, so it is looked up once
            # instead of once per book.
            word_category = session.query(WordCategory).filter_by(
                category=category, word=word).one()
        for book in books_period:
            # the book contains the word
            book_word = session.query(WordCount).filter_by(
                book=book, word=word).all()
            # without a relation between book and word nothing is counted
            if not book_word:
                continue
            rate = book_word[0].rate
            if word_category.min_range <= rate < word_category.max_range:
                num_books__word_cat += 1
        if log:
            logger.debug("      num_books__word_cat= %f" %
                         (num_books__word_cat))

        return num_books__word_cat

    def probability(self, word, category, period, log=False):
        """
        Probability of that word in that category in that period: the books
        of the period that have the word in the category divided by the
        number of books of the period.

        Raises ``ZeroDivisionError`` when the period has no books.
        """
        matching_books = self.word_category_period_probability(
            word, category, period, log=log)
        books_in_period = self.period_probability(period, log=log)
        # Both values are integer counts: without ``float`` the division
        # would be truncated to 0 or 1 under Python 2.
        result = float(matching_books) / books_in_period
        if log:
            logger.debug(
                "  word cat period prob = %f / period prob = %f = %f" %
                (matching_books, books_in_period, result))
        return result

    def conditional_probability(self):
        """
        Replaces every WordConditionalProbability with a freshly calculated
        one for each combination of period, category and word, commits and
        then runs ``complete_probability``.
        """
        session = self.manager.session
        session.query(WordConditionalProbability).delete()
        bulk = []
        words_all = session.query(Word).all()
        periods = session.query(Period).all()
        categories = session.query(Category).all()
        for period in periods:
            logger.debug(period.name)
            for category in categories:
                logger.debug(category.description)
                total = len(words_all)
                for word in words_all:
                    prob = self.probability(
                        word=word,
                        category=category,
                        period=period)
                    if prob > 1:
                        # Should not happen: calculated again with logging
                        # to leave a trace of the numbers involved.
                        logger.debug("word %s category %s  period %s prob %s" %
                                     (word.text, category.description,
                                      period.name, prob))
                        self.probability(word=word, category=category,
                                         period=period, log=True)
                    bulk.append(WordConditionalProbability(
                        word=word,
                        category=category,
                        period=period,
                        probability=prob))
                    total -= 1
                    if total % 500 == 0:
                        logger.debug("left ... %s words" % total)
        session.add_all(bulk)
        session.commit()
        self.complete_probability()

    def complete_probability(self):
        """
        Sets the probability of the ``low`` category of every word and
        period to one minus the sum of the probabilities of the ``med``,
        ``high`` and ``high_high`` categories, and commits.
        """
        session = self.manager.session
        list_cat = ['med', 'high', 'high_high']
        # A real list, built once, instead of a new generator per query.
        other_ids = [c.id for c in session.query(Category).filter(
            Category.description.in_(list_cat)).all()]
        low = session.query(Category).filter(
            Category.description == 'low').one()

        words_all = session.query(Word).all()
        periods = session.query(Period).all()
        for period in periods:
            total = len(words_all)
            for word in words_all:
                sum_3cat = session.query(
                    func.sum(WordConditionalProbability.probability)).filter(
                        and_(WordConditionalProbability.id_category.in_(
                                 other_ids),
                             WordConditionalProbability.id_word == word.id,
                             WordConditionalProbability.id_period == period.id)
                    ).all()[0][0]
                cat_low = session.query(WordConditionalProbability).filter(
                        and_(WordConditionalProbability.id_category == low.id,
                             WordConditionalProbability.id_word == word.id,
                             WordConditionalProbability.id_period == period.id)
                    ).all()
                # The sum is None when no row matched; count it as zero.
                cat_low[0].probability = 1 - (sum_3cat or 0)
                total -= 1
                if total % 500 == 0:
                    logger.debug("left ... %s words" % total)
        session.commit()