# NOTE(review): this file defines ``Query`` more than once; at import time the
# last definition is the one bound to the name.
class Query(object):
    """
    Scores a single book against the trained database and exposes the
    resulting Elizabethan / Romantic factors.

    Usable as a context manager: entering runs the query, leaving removes the
    downloaded copy of the book (if one was downloaded).
    """

    def __init__(self, text_dir, db_url, book_url, should_download=False):
        """
        ``text_dir`` is the directory where a copy of text should be put.
        ``db_url`` should be the url to a database that already exists.
        ``should_download`` tells whether ``book_url`` must be downloaded
        (True) or is used directly as the filename to read (False).
        """
        self.text_dir = text_dir
        self.db_url = db_url
        self.book_url = book_url
        self.should_download = should_download
        self.manager = Manager(db_url)
        self.extractor = Extractor(text_dir)

    def __enter__(self):
        # ``__exit__`` is not invoked when ``__enter__`` raises, so remove the
        # downloaded file here before letting the error propagate.
        try:
            self.run()
        except Exception:
            self.clean_up()
            raise
        return self

    def __exit__(self, type, value, traceback):
        self.clean_up()

    def run(self):
        """
        Runs the whole pipeline and stores the outcome in
        ``self.elizabethan_factor`` and ``self.romantic_factor``.
        """
        word_rates = self._word_rates()
        word_categories = self._word_categories(word_rates)
        wcp = self._word_conditional_probabilities(word_categories)
        self.elizabethan_factor, self.romantic_factor = self._probabilities(wcp)

    def results(self):
        """
        Returns a tuple (e, r) with the factor that this book be Elizabethan
        or Romantic respectively. Only available after ``run``.
        """
        return self.elizabethan_factor, self.romantic_factor

    def clean_up(self):
        """
        Removes the downloaded book. Does nothing when the book was not
        downloaded or when the download never got far enough to set
        ``self.filename``.
        """
        if not self.should_download:
            return
        filename = getattr(self, 'filename', None)
        if filename is not None:
            os.remove(filename)

    def _word_rates(self):
        """
        Downloads the book if needed, otherwise uses ``book_url`` as the
        filename. Returns whatever ``Extractor.read_text`` produces for that
        file (a dictionary keyed by word; its values are treated as counts by
        ``_word_categories``) and keeps it in ``self.word_rates``.
        """
        if self.should_download:
            self.filename = self.extractor.download_book(self.book_url, True)
        else:
            self.filename = self.book_url
        self.word_rates = self.extractor.read_text(self.filename)
        return self.word_rates

    def _word_categories(self, word_rates):
        """
        Generator of WordCategory objects.

        For every word that is both in the book and in the database, yields
        the WordCategory whose ``[min_range, max_range)`` contains the word's
        rate in the book. For every word in the database that did not appear
        in the book, yields its WordCategory for the 'low' category.
        """
        session = self.manager.session
        total_words = sum(word_rates.values())
        rates = {
            word: float(count) / total_words
            for word, count in word_rates.items()
        }

        # Collect the *ids* of the database words missing from the book.
        # Comparing the texts directly (not the 1-tuple rows returned by
        # ``query(Word.text)``) is what makes the exclusion effective, and
        # ``WordCategory.id_word`` has to be matched against ids, as it is
        # everywhere else in this class.
        missing_ids = [
            word.id for word in session.query(Word).all()
            if word.text not in rates
        ]
        low = session.query(Category).\
            filter(Category.description == 'low').one()
        word_category_query = session.query(WordCategory)

        for texts in dict_key_slice(rates, MAX_SLICE_SIZE):
            words = session.query(Word).filter(Word.text.in_(texts)).all()
            for word in words:
                rate = rates.get(word.text)
                # ``one()`` raises unless exactly one range contains the rate.
                yield word_category_query.\
                    filter(WordCategory.id_word == word.id).\
                    filter(WordCategory.min_range <= rate).\
                    filter(WordCategory.max_range > rate).one()

        for ids in list_slices(missing_ids, MAX_SLICE_SIZE):
            low_categories = word_category_query.\
                filter(WordCategory.id_word.in_(ids)).\
                filter(WordCategory.id_category == low.id).all()
            for word_category in low_categories:
                yield word_category

    def _word_conditional_probability(self, word_id, category_id, period_id):
        """
        Returns the single instance of WordConditionalProbability stored for
        the given word, category and period ids.
        """
        query = self.manager.session.query(WordConditionalProbability)
        return query.filter_by(id_word=word_id,
                               id_category=category_id,
                               id_period=period_id).one()

    def _word_conditional_probabilities(self, word_categories):
        """
        Receives an iterable of WordCategory objects.
        Yields tuples of ``(e, r)`` where ``e`` and ``r`` are the stored
        probabilities of that word and category for the Elizabethan and
        Romantic periods respectively.
        """
        elizabethan = self.manager.elizabethan_period
        romantic = self.manager.romantic_period
        for wc in word_categories:
            e = self._word_conditional_probability(
                wc.id_word, wc.id_category, elizabethan.id).probability
            r = self._word_conditional_probability(
                wc.id_word, wc.id_category, romantic.id).probability
            yield e, r

    def _probabilities(self, conditional_probabilities):
        """
        Receives an iterable as returned by
        ``_word_conditional_probabilities``.
        Returns a tuple ``(e, r)`` of the factor that this book be
        Elizabethan or Romantic respectively.

        Both factors start at the share of books of each period and are
        multiplied by every pair in which neither probability is zero. As
        soon as a factor reaches zero or infinity, the last pair of usable
        factors is returned instead.
        """
        elizabethan_book_count = self.manager.elizabethan_book_count
        romantic_book_count = self.manager.romantic_book_count
        total_books = elizabethan_book_count + romantic_book_count
        elizabethan_factor = float(elizabethan_book_count) / total_books
        romantic_factor = float(romantic_book_count) / total_books

        # Start the fallback values at the priors so that they are always
        # defined, even if the very first product is already unusable.
        buffer_elizabethan = elizabethan_factor
        buffer_romantic = romantic_factor
        infinity = float('inf')

        for e, r in conditional_probabilities:
            if e == 0 or r == 0:
                continue
            if e < 0.1 or r < 0.1:
                # Both factors get the same extra factor of 100, so their
                # ratio is unaffected.
                elizabethan_factor *= 100 * e
                romantic_factor *= 100 * r
            else:
                elizabethan_factor *= e
                romantic_factor *= r
            if (elizabethan_factor == 0 or romantic_factor == 0 or
                    elizabethan_factor == infinity or
                    romantic_factor == infinity):
                return buffer_elizabethan, buffer_romantic
            buffer_elizabethan = elizabethan_factor
            buffer_romantic = romantic_factor
        return elizabethan_factor, romantic_factor

    def top(self, count):
        """
        Returns the ``count`` first ``(word, value)`` pairs of
        ``self.word_rates`` ordered by decreasing value.
        """
        ordered = sorted(self.word_rates.items(), key=lambda x: -x[1])
        return ordered[0:count]
# NOTE(review): this file defines ``Trainer`` more than once; at import time
# the last definition is the one bound to the name.
class Trainer(object):
    """
    Builds the database used for classification from a JSON list of books:
    downloads the texts, stores word counts, derives the rate categories of
    every word and finally the per-period conditional probabilities.
    """

    def __init__(self, json_path, text_dir, db_url):
        """
        ``json_path`` is the JSON file describing the books, ``text_dir`` the
        directory holding the texts (created if missing) and ``db_url`` the
        url handed to ``Manager``.
        """
        self.json_path = json_path
        self.text_dir = text_dir
        self.db_url = db_url
        if not isdir(self.text_dir):
            mkdir(self.text_dir)
        self.extractor = Extractor(text_dir)
        self.manager = Manager(db_url)

    def json(self):
        """
        Returns a list of ``(author, title, period, url)`` tuples read from
        ``self.json_path``. The file is read on the first call only; later
        calls return the cached list.
        """
        if not hasattr(self, "_json"):
            with open(self.json_path, "r") as f:
                texts = json.load(f)
            self._json = [
                (text["Author"], text["Title"], text["Period"], text["URL"])
                for text in texts
            ]
        return self._json

    def get_books(self):
        """
        Downloads the book if it's not in the texts directory.
        """
        files = set(listdir(self.text_dir))
        for author, title, period, url in self.json():
            filename = format_filename(author, title)
            if filename not in files:
                logger.debug("Getting %s" % filename)
                self.extractor.download_book(url, False, author, title, period)
            else:
                logger.debug("%s already downloaded" % filename)

    def train(self):
        """
        Runs every training step in order and closes the sessions.
        """
        logger.debug(" STARTING get_books")
        self.get_books()
        logger.debug(" STARTING populate")
        self.populate()
        logger.debug(" STARTING categories")
        self.categories()
        logger.debug(" STARTING conditional_probability")
        self.conditional_probability()
        self.manager.session.close_all()

    def populate(self):
        """
        Stores the period, the book and the word counts of every book listed
        in the JSON file. Books whose text yields no words are skipped.
        """
        for author, title, period, url in self.json():
            words = self.extractor.read_text(format_filename(author, title))
            if not words:
                continue
            total_words = sum(words.values())

            period_obj = self.manager.get_or_insert(
                dict_val={'name': period},
                instance=models.Period,
                list_search=['name'])

            logger.debug("Total Words: %s", total_words)
            dic_book = {
                'name': title,
                'author': author,
                'period': period_obj,
                'total_words': total_words,
                'sentence_total': 0,
            }
            book_obj = self.manager.get_or_insert(
                dict_val=dic_book,
                instance=models.Book,
                list_search=['name', 'author', 'period'])

            logger.debug("Period id : %s %s" % (period_obj.id, period_obj.name))
            logger.debug("Book id : %s %s %s" %
                         (book_obj.id, book_obj.name, book_obj.author))
            self.manager.insert_words(words, book_obj, total_words)

    def categories(self):
        """
        Builds the categories of every word in the database and commits.
        """
        words_all = self.manager.get({}, Word, [], True)
        total = len(words_all)
        logger.debug(" categories Words %s" % total)
        for word_obj in words_all:
            self.calculate_categories(word_obj=word_obj)
            total -= 1
            if total % 500 == 0:
                logger.debug("Progressing Word -- Category... %s" % total)
        self.manager.session.commit()

    def calculate_categories(self, word_obj=None):
        """
        Builds the categories of ``word_obj`` from its maximum and minimum
        rate. Returns False when no word is given, None otherwise.
        """
        if not word_obj:
            return False
        max_rate, min_rate = self.manager.get_max_min_rate(word_obj)
        self.manager.construct_categories(min_rate, max_rate, word_obj)

    def period_probability(self, period, log=False):
        """
        Returns the number of books of that period (the count itself, not
        divided by the total number of books).
        """
        books_period = self.manager.session.query(Book).filter_by(
            period=period).count()
        if log:
            logger.debug(" books_period = %f " % (books_period))
        return books_period

    def word_category_period_probability(self, word, category, period,
                                         log=False):
        """
        Returns how many books of that period have that word with a rate
        inside the ``[min_range, max_range)`` of that category. Books that do
        not contain the word do not count.
        """
        session = self.manager.session
        num_books__word_cat = 0
        books_period = session.query(Book).filter_by(period=period).all()
        if books_period:
            # The range does not depend on the book, so look it up once
            # (and only when there is at least one book to compare against).
            word_category = session.query(WordCategory).filter_by(
                category=category, word=word).one()
            for book in books_period:
                book_word = session.query(WordCount).filter_by(
                    book=book, word=word).all()
                if not book_word:
                    continue
                rate = book_word[0].rate
                if word_category.min_range <= rate < word_category.max_range:
                    num_books__word_cat += 1
        if log:
            logger.debug(" num_books__word_cat= %f" % (num_books__word_cat))
        return num_books__word_cat

    def probability(self, word, category, period, log=False):
        """
        Returns the fraction of the books of that period that have that word
        in that category: the result of ``word_category_period_probability``
        divided by the result of ``period_probability``.
        """
        word_category_period_probability = \
            self.word_category_period_probability(
                word, category, period, log=log)
        period_probability = self.period_probability(period, log=log)
        # Both operands are counts; force a float division so the result is
        # a fraction regardless of the interpreter's integer division rules.
        result = float(word_category_period_probability) / period_probability
        if log:
            logger.debug(
                " word cat period prob = %f / period prob = %f = %f" %
                (word_category_period_probability, period_probability, result))
        return result

    def conditional_probability(self):
        """
        Deletes every stored WordConditionalProbability, recomputes one for
        each (period, category, word) combination, commits them and then
        runs ``complete_probability``.
        """
        session = self.manager.session
        session.query(WordConditionalProbability).delete()
        bulk = []
        words_all = session.query(Word).all()
        periods = session.query(Period).all()
        categories = session.query(Category).all()
        for period in periods:
            logger.debug(period.name)
            for category in categories:
                logger.debug(category.description)
                total = len(words_all)
                for word in words_all:
                    prob = self.probability(
                        word=word, category=category, period=period)
                    if prob > 1:
                        # Should not happen; log the intermediate values.
                        logger.debug(
                            "word %s category %s period %s prob %s" %
                            (word.text, category.description, period.name,
                             prob))
                        self.probability(word=word, category=category,
                                         period=period, log=True)
                    bulk.append(WordConditionalProbability(
                        word=word, category=category, period=period,
                        probability=prob))
                    total -= 1
                    if total % 500 == 0:
                        logger.debug("left ... %s words" % total)
        session.add_all(bulk)
        session.commit()
        self.complete_probability()

    def complete_probability(self):
        """
        For every period and word, overwrites the probability stored for the
        'low' category with one minus the sum of the probabilities stored
        for the 'med', 'high' and 'high_high' categories, then commits.
        """
        session = self.manager.session
        upper_categories = session.query(Category).filter(
            Category.description.in_(['med', 'high', 'high_high'])).all()
        # Materialised once: the same list is reused by every query below.
        upper_ids = [c.id for c in upper_categories]
        low = session.query(Category).filter(
            Category.description == 'low').one()
        words_all = session.query(Word).all()
        periods = session.query(Period).all()
        for period in periods:
            total = len(words_all)
            for word in words_all:
                sum_3cat = session.query(
                    func.sum(WordConditionalProbability.probability)).filter(
                        and_(
                            WordConditionalProbability.id_category.in_(
                                upper_ids),
                            WordConditionalProbability.id_word == word.id,
                            WordConditionalProbability.id_period == period.id)
                    ).all()[0][0]
                if sum_3cat is None:
                    # SUM over no rows is NULL; treat it as nothing to
                    # subtract.
                    sum_3cat = 0.0
                cat_low = session.query(WordConditionalProbability).filter(
                    and_(WordConditionalProbability.id_category == low.id,
                         WordConditionalProbability.id_word == word.id,
                         WordConditionalProbability.id_period == period.id)
                ).all()
                cat_low[0].probability = 1 - sum_3cat
                total -= 1
                if total % 500 == 0:
                    logger.debug("left ... %s words" % total)
        session.commit()
# NOTE(review): this file defines ``Query`` more than once; at import time the
# last definition is the one bound to the name.
class Query(object):
    """
    Scores a single book against the trained database and exposes the
    resulting Elizabethan / Romantic factors.

    Usable as a context manager: entering runs the query, leaving removes the
    downloaded copy of the book (if one was downloaded).
    """

    def __init__(self, text_dir, db_url, book_url, should_download=False):
        """
        ``text_dir`` is the directory where a copy of text should be put.
        ``db_url`` should be the url to a database that already exists.
        ``should_download`` tells whether ``book_url`` must be downloaded
        (True) or is used directly as the filename to read (False).
        """
        self.text_dir = text_dir
        self.db_url = db_url
        self.book_url = book_url
        self.should_download = should_download
        self.manager = Manager(db_url)
        self.extractor = Extractor(text_dir)

    def __enter__(self):
        # ``__exit__`` is not invoked when ``__enter__`` raises, so remove the
        # downloaded file here before letting the error propagate.
        try:
            self.run()
        except Exception:
            self.clean_up()
            raise
        return self

    def __exit__(self, type, value, traceback):
        self.clean_up()

    def run(self):
        """
        Runs the whole pipeline and stores the outcome in
        ``self.elizabethan_factor`` and ``self.romantic_factor``.
        """
        word_rates = self._word_rates()
        word_categories = self._word_categories(word_rates)
        wcp = self._word_conditional_probabilities(word_categories)
        self.elizabethan_factor, self.romantic_factor = self._probabilities(wcp)

    def results(self):
        """
        Returns a tuple (e, r) with the factor that this book be Elizabethan
        or Romantic respectively. Only available after ``run``.
        """
        return self.elizabethan_factor, self.romantic_factor

    def clean_up(self):
        """
        Removes the downloaded book. Does nothing when the book was not
        downloaded or when the download never got far enough to set
        ``self.filename``.
        """
        if not self.should_download:
            return
        filename = getattr(self, 'filename', None)
        if filename is not None:
            os.remove(filename)

    def _word_rates(self):
        """
        Downloads the book if needed, otherwise uses ``book_url`` as the
        filename. Returns whatever ``Extractor.read_text`` produces for that
        file (a dictionary keyed by word; its values are treated as counts by
        ``_word_categories``) and keeps it in ``self.word_rates``.
        """
        if self.should_download:
            self.filename = self.extractor.download_book(self.book_url, True)
        else:
            self.filename = self.book_url
        self.word_rates = self.extractor.read_text(self.filename)
        return self.word_rates

    def _word_categories(self, word_rates):
        """
        Generator of WordCategory objects.

        For every word that is both in the book and in the database, yields
        the WordCategory whose ``[min_range, max_range)`` contains the word's
        rate in the book. For every word in the database that did not appear
        in the book, yields its WordCategory for the 'low' category.
        """
        session = self.manager.session
        total_words = sum(word_rates.values())
        rates = {
            word: float(count) / total_words
            for word, count in word_rates.items()
        }

        # Collect the *ids* of the database words missing from the book.
        # Comparing the texts directly (not the 1-tuple rows returned by
        # ``query(Word.text)``) is what makes the exclusion effective, and
        # ``WordCategory.id_word`` has to be matched against ids, as it is
        # everywhere else in this class.
        missing_ids = [
            word.id for word in session.query(Word).all()
            if word.text not in rates
        ]
        low = session.query(Category).\
            filter(Category.description == 'low').one()
        word_category_query = session.query(WordCategory)

        for texts in dict_key_slice(rates, MAX_SLICE_SIZE):
            words = session.query(Word).filter(Word.text.in_(texts)).all()
            for word in words:
                rate = rates.get(word.text)
                # ``one()`` raises unless exactly one range contains the rate.
                yield word_category_query.\
                    filter(WordCategory.id_word == word.id).\
                    filter(WordCategory.min_range <= rate).\
                    filter(WordCategory.max_range > rate).one()

        for ids in list_slices(missing_ids, MAX_SLICE_SIZE):
            low_categories = word_category_query.\
                filter(WordCategory.id_word.in_(ids)).\
                filter(WordCategory.id_category == low.id).all()
            for word_category in low_categories:
                yield word_category

    def _word_conditional_probability(self, word_id, category_id, period_id):
        """
        Returns the single instance of WordConditionalProbability stored for
        the given word, category and period ids.
        """
        query = self.manager.session.query(WordConditionalProbability)
        return query.filter_by(id_word=word_id,
                               id_category=category_id,
                               id_period=period_id).one()

    def _word_conditional_probabilities(self, word_categories):
        """
        Receives an iterable of WordCategory objects.
        Yields tuples of ``(e, r)`` where ``e`` and ``r`` are the stored
        probabilities of that word and category for the Elizabethan and
        Romantic periods respectively.
        """
        elizabethan = self.manager.elizabethan_period
        romantic = self.manager.romantic_period
        for wc in word_categories:
            e = self._word_conditional_probability(
                wc.id_word, wc.id_category, elizabethan.id).probability
            r = self._word_conditional_probability(
                wc.id_word, wc.id_category, romantic.id).probability
            yield e, r

    def _probabilities(self, conditional_probabilities):
        """
        Receives an iterable as returned by
        ``_word_conditional_probabilities``.
        Returns a tuple ``(e, r)`` of the factor that this book be
        Elizabethan or Romantic respectively.

        Both factors start at the share of books of each period and are
        multiplied by every pair in which neither probability is zero. As
        soon as a factor reaches zero or infinity, the last pair of usable
        factors is returned instead.
        """
        elizabethan_book_count = self.manager.elizabethan_book_count
        romantic_book_count = self.manager.romantic_book_count
        total_books = elizabethan_book_count + romantic_book_count
        elizabethan_factor = float(elizabethan_book_count) / total_books
        romantic_factor = float(romantic_book_count) / total_books

        # Start the fallback values at the priors so that they are always
        # defined, even if the very first product is already unusable.
        buffer_elizabethan = elizabethan_factor
        buffer_romantic = romantic_factor
        infinity = float('inf')

        for e, r in conditional_probabilities:
            if e == 0 or r == 0:
                continue
            if e < 0.1 or r < 0.1:
                # Both factors get the same extra factor of 100, so their
                # ratio is unaffected.
                elizabethan_factor *= 100 * e
                romantic_factor *= 100 * r
            else:
                elizabethan_factor *= e
                romantic_factor *= r
            if (elizabethan_factor == 0 or romantic_factor == 0 or
                    elizabethan_factor == infinity or
                    romantic_factor == infinity):
                return buffer_elizabethan, buffer_romantic
            buffer_elizabethan = elizabethan_factor
            buffer_romantic = romantic_factor
        return elizabethan_factor, romantic_factor

    def top(self, count):
        """
        Returns the ``count`` first ``(word, value)`` pairs of
        ``self.word_rates`` ordered by decreasing value.
        """
        ordered = sorted(self.word_rates.items(), key=lambda x: -x[1])
        return ordered[0:count]
# NOTE(review): this file defines ``Trainer`` more than once; at import time
# the last definition is the one bound to the name.
class Trainer(object):
    """
    Builds the database used for classification from a JSON list of books:
    downloads the texts, stores word counts, derives the rate categories of
    every word and finally the per-period conditional probabilities.
    """

    def __init__(self, json_path, text_dir, db_url):
        """
        ``json_path`` is the JSON file describing the books, ``text_dir`` the
        directory holding the texts (created if missing) and ``db_url`` the
        url handed to ``Manager``.
        """
        self.json_path = json_path
        self.text_dir = text_dir
        self.db_url = db_url
        if not isdir(self.text_dir):
            mkdir(self.text_dir)
        self.extractor = Extractor(text_dir)
        self.manager = Manager(db_url)

    def json(self):
        """
        Returns a list of ``(author, title, period, url)`` tuples read from
        ``self.json_path``. The file is read on the first call only; later
        calls return the cached list.
        """
        if not hasattr(self, "_json"):
            with open(self.json_path, "r") as f:
                texts = json.load(f)
            self._json = [
                (text["Author"], text["Title"], text["Period"], text["URL"])
                for text in texts
            ]
        return self._json

    def get_books(self):
        """
        Downloads the book if it's not in the texts directory.
        """
        files = set(listdir(self.text_dir))
        for author, title, period, url in self.json():
            filename = format_filename(author, title)
            if filename not in files:
                logger.debug("Getting %s" % filename)
                self.extractor.download_book(url, False, author, title, period)
            else:
                logger.debug("%s already downloaded" % filename)

    def train(self):
        """
        Runs every training step in order and closes the sessions.
        """
        logger.debug(" STARTING get_books")
        self.get_books()
        logger.debug(" STARTING populate")
        self.populate()
        logger.debug(" STARTING categories")
        self.categories()
        logger.debug(" STARTING conditional_probability")
        self.conditional_probability()
        self.manager.session.close_all()

    def populate(self):
        """
        Stores the period, the book and the word counts of every book listed
        in the JSON file. Books whose text yields no words are skipped.
        """
        for author, title, period, url in self.json():
            words = self.extractor.read_text(format_filename(author, title))
            if not words:
                continue
            total_words = sum(words.values())

            period_obj = self.manager.get_or_insert(
                dict_val={'name': period},
                instance=models.Period,
                list_search=['name'])

            logger.debug("Total Words: %s", total_words)
            dic_book = {
                'name': title,
                'author': author,
                'period': period_obj,
                'total_words': total_words,
                'sentence_total': 0,
            }
            book_obj = self.manager.get_or_insert(
                dict_val=dic_book,
                instance=models.Book,
                list_search=['name', 'author', 'period'])

            logger.debug("Period id : %s %s" % (period_obj.id, period_obj.name))
            logger.debug("Book id : %s %s %s" %
                         (book_obj.id, book_obj.name, book_obj.author))
            self.manager.insert_words(words, book_obj, total_words)

    def categories(self):
        """
        Builds the categories of every word in the database and commits.
        """
        words_all = self.manager.get({}, Word, [], True)
        total = len(words_all)
        logger.debug(" categories Words %s" % total)
        for word_obj in words_all:
            self.calculate_categories(word_obj=word_obj)
            total -= 1
            if total % 500 == 0:
                logger.debug("Progressing Word -- Category... %s" % total)
        self.manager.session.commit()

    def calculate_categories(self, word_obj=None):
        """
        Builds the categories of ``word_obj`` from its maximum and minimum
        rate. Returns False when no word is given, None otherwise.
        """
        if not word_obj:
            return False
        max_rate, min_rate = self.manager.get_max_min_rate(word_obj)
        self.manager.construct_categories(min_rate, max_rate, word_obj)

    def period_probability(self, period, log=False):
        """
        Returns the number of books of that period (the count itself, not
        divided by the total number of books).
        """
        books_period = self.manager.session.query(Book).filter_by(
            period=period).count()
        if log:
            logger.debug(" books_period = %f " % (books_period))
        return books_period

    def word_category_period_probability(self, word, category, period,
                                         log=False):
        """
        Returns how many books of that period have that word with a rate
        inside the ``[min_range, max_range)`` of that category. Books that do
        not contain the word do not count.
        """
        session = self.manager.session
        num_books__word_cat = 0
        books_period = session.query(Book).filter_by(period=period).all()
        if books_period:
            # The range does not depend on the book, so look it up once
            # (and only when there is at least one book to compare against).
            word_category = session.query(WordCategory).filter_by(
                category=category, word=word).one()
            for book in books_period:
                book_word = session.query(WordCount).filter_by(
                    book=book, word=word).all()
                if not book_word:
                    continue
                rate = book_word[0].rate
                if word_category.min_range <= rate < word_category.max_range:
                    num_books__word_cat += 1
        if log:
            logger.debug(" num_books__word_cat= %f" % (num_books__word_cat))
        return num_books__word_cat

    def probability(self, word, category, period, log=False):
        """
        Returns the fraction of the books of that period that have that word
        in that category: the result of ``word_category_period_probability``
        divided by the result of ``period_probability``.
        """
        word_category_period_probability = \
            self.word_category_period_probability(
                word, category, period, log=log)
        period_probability = self.period_probability(period, log=log)
        # Both operands are counts; force a float division so the result is
        # a fraction regardless of the interpreter's integer division rules.
        result = float(word_category_period_probability) / period_probability
        if log:
            logger.debug(
                " word cat period prob = %f / period prob = %f = %f" %
                (word_category_period_probability, period_probability, result))
        return result

    def conditional_probability(self):
        """
        Deletes every stored WordConditionalProbability, recomputes one for
        each (period, category, word) combination, commits them and then
        runs ``complete_probability``.
        """
        session = self.manager.session
        session.query(WordConditionalProbability).delete()
        bulk = []
        words_all = session.query(Word).all()
        periods = session.query(Period).all()
        categories = session.query(Category).all()
        for period in periods:
            logger.debug(period.name)
            for category in categories:
                logger.debug(category.description)
                total = len(words_all)
                for word in words_all:
                    prob = self.probability(
                        word=word, category=category, period=period)
                    if prob > 1:
                        # Should not happen; log the intermediate values.
                        logger.debug(
                            "word %s category %s period %s prob %s" %
                            (word.text, category.description, period.name,
                             prob))
                        self.probability(word=word, category=category,
                                         period=period, log=True)
                    bulk.append(WordConditionalProbability(
                        word=word, category=category, period=period,
                        probability=prob))
                    total -= 1
                    if total % 500 == 0:
                        logger.debug("left ... %s words" % total)
        session.add_all(bulk)
        session.commit()
        self.complete_probability()

    def complete_probability(self):
        """
        For every period and word, overwrites the probability stored for the
        'low' category with one minus the sum of the probabilities stored
        for the 'med', 'high' and 'high_high' categories, then commits.
        """
        session = self.manager.session
        upper_categories = session.query(Category).filter(
            Category.description.in_(['med', 'high', 'high_high'])).all()
        # Materialised once: the same list is reused by every query below.
        upper_ids = [c.id for c in upper_categories]
        low = session.query(Category).filter(
            Category.description == 'low').one()
        words_all = session.query(Word).all()
        periods = session.query(Period).all()
        for period in periods:
            total = len(words_all)
            for word in words_all:
                sum_3cat = session.query(
                    func.sum(WordConditionalProbability.probability)).filter(
                        and_(
                            WordConditionalProbability.id_category.in_(
                                upper_ids),
                            WordConditionalProbability.id_word == word.id,
                            WordConditionalProbability.id_period == period.id)
                    ).all()[0][0]
                if sum_3cat is None:
                    # SUM over no rows is NULL; treat it as nothing to
                    # subtract.
                    sum_3cat = 0.0
                cat_low = session.query(WordConditionalProbability).filter(
                    and_(WordConditionalProbability.id_category == low.id,
                         WordConditionalProbability.id_word == word.id,
                         WordConditionalProbability.id_period == period.id)
                ).all()
                cat_low[0].probability = 1 - sum_3cat
                total -= 1
                if total % 500 == 0:
                    logger.debug("left ... %s words" % total)
        session.commit()