def _c_convert_timestamp(val):
    '''
    Convert a timestamp string into a datetime expressed in ``local_tz``.

    The stripped value is first handed to ``_c_speedup.parse_date``. If that
    call raises or returns ``None``, the original (unstripped) value is
    parsed with ``parse_date(val, as_utc=False)`` instead and that result is
    returned as-is.

    :param val: the timestamp text; any falsy value (``None``, ``''``)
        yields ``None``.
    :return: ``None`` for falsy input, otherwise the parsed datetime. When
        the fast path succeeds the result is converted to ``local_tz``; if
        that conversion overflows, ``UNDEFINED_DATE`` converted to
        ``local_tz`` is returned.
    '''
    if not val:
        return None

    try:
        ret = _c_speedup.parse_date(val.strip())
    except Exception:
        # Best effort: any failure of the fast parser just means we use the
        # fallback parser below. Only ``Exception`` is caught so that
        # KeyboardInterrupt / SystemExit still propagate.
        ret = None
    if ret is None:
        return parse_date(val, as_utc=False)

    # The fast parser yields the date/time fields plus a value that is
    # passed to tzoffset() as the UTC offset.
    year, month, day, hour, minutes, seconds, tzsecs = ret
    try:
        return datetime(
            year, month, day, hour, minutes, seconds,
            tzinfo=tzoffset(None, tzsecs),
        ).astimezone(local_tz)
    except OverflowError:
        # The converted value cannot be represented; report the sentinel
        # "undefined" date instead of failing.
        return UNDEFINED_DATE.astimezone(local_tz)
def count_genres(self):
    '''
    Count the genres and list the most common ones.

    Walks every record in ``self.db.data`` and tallies, per book:

    * genre and location, by matching the book's lower-cased,
      comma-separated tags against the glob tables below (converted with
      ``self.tag_list_to_regexp``);
    * the distinct words of the title;
    * the publication year, the built-in rating, and every custom column
      whose datatype is ``'rating'`` or ``'datetime'``;
    * the file formats and the number of formats per book.

    The aggregated numbers are appended to a results list through the
    ``self.add_*_to_results`` helpers and shown in a ``ChartDialog``.
    Nothing is returned.
    '''
    genres = [
        {'label': "Science Fiction", 'tags': ["science fiction*", "scifi*", "science-fiction*", "space*", "*other planet*", "sagas"]},
        {'label': "Fantasy", 'tags': ["fantasy*", "magic*"]},
        {'label': "Adventure", 'tags': ["*adventure*", "*pirates*"]},
        {'label': "Thriller", 'tags': ["*thriller*", "suspense*", "psychological*", "espionage*"]},
        {'label': "Mystery", 'tags': ["mystery*", "*detective*", "*sleuth*", "murder*"]},
        {'label': "Romance", 'tags': ["*romance*", "love*", "*romantic*"]},
        {'label': "Historical", 'tags': ["*historical*"]},
        {'label': "Humour", 'tags': ["*humour*", "*humorous*", "*parody*", "*satire*", "*satirical*", "humor*"]},
        {'label': "Criminal", 'tags': ["Criminal*", "Police*", "hard-boiled*", "crime"]},
        {'label': "Military", 'tags': ["*military*", "*war*"]},
        {'label': "Erotica", 'tags': ["*erotica*", "anal", "bdsm", "sex"]},
        {'label': "Religion", 'tags': ["*religio*", "*christianity*", "*islam*", "*muslim*", "*buddhism*", "*hinduism*", "*catholi*", "*protestantism*"]},
        {'label': "Horror", 'tags': ["*horror*", "*fear*", "*zombies*"]},
        {'label': "Classics", 'tags': ["classics", "literature -classics"]},
        {'label': "Juvenile", 'tags': ["*juvenile*", "*children's*"]},
        {'label': "Non Fiction", 'tags': [
            "biography*", "*non fiction*", "*memoirs*", "*business*econom*", "travel",
            "computers*", "finance", "mathematics", "physics", "zoology", "programming*",
            "social science*", "political science*", "medical", "usenet", "reference*",
            "science", "*non-fiction*", "language arts*", "philosophy", "*education*",
            "*nonfiction*"]},
    ]
    locations = [
        {'label': "America", 'tags': [
            "*america*", "*usa*", "*canad*", "*mexico*", "*brazil*", "*new york*", "NY",
            "*california*", "boston", "*los angeles*", "la", "*massachusetts*", "*texas*",
            "*north carolina*", "westerns", "*chicago*", "*florida*", "*maine*", "*alaska*",
            "n.y.", "seattle*", "new england", "new jersey", "*manhattan*", "*illinois*",
            "minnesota", "*new orleans*", "united states*", "montana", "las vegas*"]},
        {'label': "Europe", 'tags': [
            "*europe*", "*sweden*", "*germany*", "*britain*", "*france*", "*italy*",
            "*ireland*", "*spain*", "*portugal*", "*poland*", "*russia*", "london", "rome",
            "english*", "*scotland*", "ireland", "england", "paris", "soviet*", "wales",
            "greece", "*(wales)"]},
        {'label': "Africa", 'tags': ["*africa*", "*egypt*"]},
        {'label': "Asia", 'tags': [
            "*asia*", "*japan*", "*china*", "*hongkong*", "*singapore*", "*india*", "*iraq*",
            "*iran*", "*middle east*", "*far east*", "*vietnam*", "*pakistan*"]},
        {'label': "Oceania", 'tags': ["*oceania*", "*australia*", "*new zealand*", "*new zeeland*"]},
    ]
    # Tags too generic to be interesting in the "strange tags" top list.
    over_generic_tags = [
        "fiction", "general", "literary", "fiction - general", "ebook", "book",
        "general & literary fiction", "essays", "general fiction",
        "fiction & literature", "popular literature",
    ]
    # Title words too generic to be interesting in the title-word top list.
    over_generic_book_title_words = [
        "the", "a", "of", "at", "in", "to", "on", "and", "for", "an", "from",
        "&", "-", "with", "is", "are", "was", "by",
    ]
    top_list_max_length = 30

    # Change globs to regexps and reset the per-category counters.
    for category in genres + locations:
        category['tags'] = self.tag_list_to_regexp(category['tags'])
        category['count'] = 0

    field_map = self.db.FIELD_MAP
    tags_column_idx = field_map['tags']
    pubdate_column_idx = field_map['pubdate']
    title_column_idx = field_map['title']
    rating_column_idx = field_map['rating']
    format_column_idx = field_map['formats']

    total_book_count = 0
    unknown_genre_book_count = 0
    unknown_location_book_count = 0
    unknown_year_book_count = 0
    common_tags_on_unknown_genre = {}
    common_tags_on_unknown_location = {}
    year_histogram = {}
    rating_integer_histogram = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
    rating_exact_histogram = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
    title_word_counter = {}
    format_count_histogram = {0: 0, 1: 0, 2: 0}
    format_counter = {}

    # Parallel lists describing the custom 'rating' and 'datetime' columns:
    # one counter, one record index and one display label per column.
    custom_rating_counters = []
    custom_rating_column_idxs = []
    custom_rating_labels = []
    custom_date_counters = []
    custom_date_column_idxs = []
    custom_date_labels = []
    for custom_column in self.db.custom_column_num_map.values():
        datatype = custom_column['datatype']
        if datatype == 'rating':
            custom_rating_counters.append({1: 0, 2: 0, 3: 0, 4: 0, 5: 0})
            custom_rating_column_idxs.append(field_map[custom_column['num']])
            custom_rating_labels.append(custom_column['name'])
        elif datatype == 'datetime':
            custom_date_counters.append({})
            custom_date_column_idxs.append(field_map[custom_column['num']])
            custom_date_labels.append(custom_column['name'])

    def report_suspicious_year(kind, when, title, record):
        # Diagnostic output for dates whose year is below 1000.
        print("Bad " + kind + " (" + str(when) + ") for book " + title + " by " + str(record[self.db.FIELD_MAP['authors']]))
        print("UNDEFINED_DATE = " + str(UNDEFINED_DATE))

    # Iterate over visible books. (self.db.data.iterall() would iterate over
    # all books in the database.)
    for record in self.db.data:
        book_title = record[title_column_idx]
        tags = record[tags_column_idx]
        total_book_count += 1

        # Title words: each distinct word counts once per book.
        book_words = set()
        for raw_word in book_title.lower().split():
            book_word = raw_word.strip(":&-(),")
            if book_word:
                book_words.add(book_word)
        for book_word in book_words:
            self.increase_string_count(title_word_counter, book_word)

        # Genres and locations. A book counts at most once per category but
        # may count toward several categories.
        # This became much slower when going from "in string" matching to
        # regexps. Too slow?
        book_tag_list = tags.lower().split(',') if tags else []
        known_genre_tag = False
        for genre in genres:
            genre_tag_regexp = genre['tags']
            if any(genre_tag_regexp.match(tag) for tag in book_tag_list):
                genre['count'] += 1
                known_genre_tag = True
        known_location_tag = False
        for location in locations:
            location_tag_regexp = location['tags']
            if any(location_tag_regexp.match(tag) for tag in book_tag_list):
                location['count'] += 1
                known_location_tag = True

        if not known_genre_tag:
            unknown_genre_book_count += 1
            self.add_tags_to_counter(common_tags_on_unknown_genre, book_tag_list)
        if not known_location_tag:
            unknown_location_book_count += 1
            self.add_tags_to_counter(common_tags_on_unknown_location, book_tag_list)

        # Publication year. A missing pubdate is treated like the undefined
        # date, mirroring how custom dates are handled below.
        # The handling of UNDEFINED_DATE has timezone troubles...
        pubdate_datetime = record[pubdate_column_idx]
        if pubdate_datetime and pubdate_datetime.date() != UNDEFINED_DATE.date():
            year = pubdate_datetime.year
            if year < 1000:
                report_suspicious_year("pubdate", pubdate_datetime, book_title, record)
            self.increase_number_count(year_histogram, year)
        else:
            unknown_year_book_count += 1

        # Custom dates
        for custom_date_counter, custom_date_column_idx in zip(custom_date_counters, custom_date_column_idxs):
            custom_date = record[custom_date_column_idx]
            if custom_date and custom_date.date() != UNDEFINED_DATE.date():
                year = custom_date.year
                if year < 1000:
                    report_suspicious_year("date", custom_date, book_title, record)
                self.increase_number_count(custom_date_counter, year)

        # Ratings: counted both as stored and after conversion with
        # ten_rating_to_five_rating.
        rating = record[rating_column_idx]
        if rating:
            int_rating = self.ten_rating_to_five_rating(rating)
            rating_integer_histogram[int_rating] += 1
            rating_exact_histogram[rating] += 1

        # Custom ratings
        for custom_rating_counter, custom_rating_column_idx in zip(custom_rating_counters, custom_rating_column_idxs):
            custom_rating = record[custom_rating_column_idx]
            if custom_rating:
                int_rating = self.ten_rating_to_five_rating(custom_rating)
                custom_rating_counter[int_rating] += 1

        # Formats
        formats = record[format_column_idx]
        format_count = 0
        if formats:
            format_list = formats.split(',')
            format_count = len(format_list)
            for book_format in format_list:
                self.increase_number_count(format_counter, book_format)
        self.increase_number_count(format_count_histogram, format_count)

    # Drop the over-generic entries before building the top lists.
    for over_generic_tag in over_generic_tags:
        common_tags_on_unknown_genre.pop(over_generic_tag, None)
        common_tags_on_unknown_location.pop(over_generic_tag, None)
    for over_generic_book_title_word in over_generic_book_title_words:
        title_word_counter.pop(over_generic_book_title_word, None)

    common_strange_genre_tags = sorted(common_tags_on_unknown_genre.items(), key=itemgetter(1), reverse=True)
    # The tags of books with unknown location are collected but not
    # reported: that output just lists common genres. Meaningless.
    common_title_words = self.truncate_top_list(
        sorted(title_word_counter.items(), key=itemgetter(1), reverse=True),
        top_list_max_length)

    results = []
    self.add_result_to_results(results, genres, unknown_genre_book_count, total_book_count, "Genre")
    self.add_histogram_to_results(results, year_histogram, unknown_year_book_count, "Published")
    self.add_histogram_to_results(results, rating_integer_histogram, 0, "Ratings", total_book_count)
    self.add_histogram_to_results(results, rating_exact_histogram, 0, "Exact ratings", total_book_count)
    self.add_top_list_to_results(results, common_title_words, "Common title words", "Word")
    self.add_top_list_to_results(
        results,
        self.truncate_top_list(common_strange_genre_tags, top_list_max_length),
        "Common 'strange' tags", "Tag")
    for custom_rating_counter, custom_rating_label in zip(custom_rating_counters, custom_rating_labels):
        self.add_histogram_to_results(results, custom_rating_counter, 0, custom_rating_label)
    for custom_date_counter, custom_date_label in zip(custom_date_counters, custom_date_labels):
        self.add_histogram_to_results(results, custom_date_counter, 0, custom_date_label)
    self.add_counter_to_results(results, format_counter.items(), total_book_count, "Formats")
    self.add_histogram_to_results(results, format_count_histogram, 0, "Formats/book", total_book_count)
    self.add_result_to_results(results, locations, unknown_location_book_count, total_book_count, "Location")

    dialog = ChartDialog(self.gui, self.icon, results)
    dialog.show()