Esempio n. 1
0
def _c_convert_timestamp(val):
    if not val:
        return None
    try:
        ret = _c_speedup.parse_date(val.strip())
    except:
        ret = None
    if ret is None:
        return parse_date(val, as_utc=False)
    year, month, day, hour, minutes, seconds, tzsecs = ret
    try:
        return datetime(year, month, day, hour, minutes, seconds, tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
    except OverflowError:
        return UNDEFINED_DATE.astimezone(local_tz)
Esempio n. 2
0
def _c_convert_timestamp(val):
    if not val:
        return None
    try:
        ret = _c_speedup.parse_date(val.strip())
    except:
        ret = None
    if ret is None:
        return parse_date(val, as_utc=False)
    year, month, day, hour, minutes, seconds, tzsecs = ret
    try:
        return datetime(year, month, day, hour, minutes, seconds,
                tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
    except OverflowError:
        return UNDEFINED_DATE.astimezone(local_tz)
Esempio n. 3
0
    def count_genres(self):
        ''' Gather library statistics and show them in a ChartDialog.

        For every record in self.db.data this counts:
          * genre and location matches, by testing each tag against the
            glob lists below (converted by self.tag_list_to_regexp),
          * publication year, plus years of custom 'datetime' columns,
          * the rating, plus ratings of custom 'rating' columns,
          * words occurring in titles (each word once per book),
          * formats, and the number of formats per book.

        The collected results are handed to ChartDialog, which is shown.
        Returns None.
        '''

        # NOTE(review): tags are lower-cased before matching, but a few globs
        # ("Criminal*", "Police*", "NY") contain capitals. Whether they can
        # match depends on tag_list_to_regexp -- confirm it ignores case.
        genres = [
            {'label': "Science Fiction", 'tags': ["science fiction*", "scifi*", "science-fiction*", "space*", "*other planet*", "sagas"]},
            {'label': "Fantasy", 'tags': ["fantasy*", "magic*"]},
            {'label': "Adventure", 'tags': ["*adventure*", "*pirates*"]},
            {'label': "Thriller", 'tags': ["*thriller*", "suspense*", "psychological*", "espionage*"]},
            {'label': "Mystery", 'tags': ["mystery*", "*detective*", "*sleuth*", "murder*"]},
            {'label': "Romance", 'tags': ["*romance*", "love*", "*romantic*"]},
            {'label': "Historical", 'tags': ["*historical*"]},
            {'label': "Humour", 'tags': ["*humour*", "*humorous*", "*parody*", "*satire*", "*satirical*", "humor*"]},
            {'label': "Criminal", 'tags': ["Criminal*", "Police*", "hard-boiled*", "crime"]},
            {'label': "Military", 'tags': ["*military*", "*war*"]},
            {'label': "Erotica", 'tags': ["*erotica*", "anal", "bdsm", "sex"]},
            {'label': "Religion", 'tags': ["*religio*", "*christianity*", "*islam*", "*muslim*", "*buddhism*", "*hinduism*", "*catholi*", "*protestantism*"]},
            {'label': "Horror", 'tags': ["*horror*", "*fear*", "*zombies*"]},
            {'label': "Classics", 'tags': ["classics", "literature -classics"]},
            {'label': "Juvenile", 'tags': ["*juvenile*", "*children's*"]},
            # "*education*" added: the original "*edcuation*" is a misspelling
            # that cannot match correctly spelled tags. The old glob is kept
            # so nothing that matched before stops matching.
            {'label': "Non Fiction", 'tags': ["biography*", "*non fiction*", "*memoirs*", "*business*econom*", "travel", "computers*", "finance", "mathematics", "physics", "zoology", "programming*", "social science*", "political science*", "medical", "usenet", "reference*", "science", "*non-fiction*", "language arts*", "philosophy", "*edcuation*", "*education*", "*nonfiction*"]},
            ]

        locations = [
            {'label': "America", 'tags': ["*america*", "*usa*", "*canad*", "*mexico*", "*brazil*", "*new york*", "NY", "*california*", "boston", "*los angeles*", "la", "*massachusetts*", "*texas*", "*north carolina*", "westerns", "*chicago*", "*florida*", "*maine*", "*alaska*", "n.y.", "seattle*", "new england", "new jersey", "*manhattan*", "*illinois*", "minnesota", "*new orleans*", "united states*", "montana", "las vegas*"]},
            {'label': "Europe", 'tags': ["*europe*", "*sweden*", "*germany*", "*britain*", "*france*", "*italy*", "*ireland*", "*spain*", "*portugal*", "*poland*", "*russia*", "london", "rome", "english*", "*scotland*", "ireland", "england", "paris", "soviet*", "wales", "greece", "*(wales)"]},
            {'label': "Africa", 'tags': ["*africa*", "*egypt*"]},
            {'label': "Asia", 'tags': ["*asia*", "*japan*", "*china*", "*hongkong*", "*singapore*", "*india*", "*iraq*", "*iran*", "*middle east*", "*far east*", "*vietnam*", "*pakistan*"]},
            # Correct spellings added next to the original misspelled globs.
            {'label': "Oceania", 'tags': ["*ocenania*", "*oceania*", "*australia*", "*new zeeland*", "*new zealand*"]},
            ]

        over_generic_tags = ["fiction", "general", "literary", "fiction - general", "ebook", "book", "general & literary fiction", "essays", "general fiction", "fiction & literature", "popular literature"]

        for category in genres + locations:
            # Change globs to regexps.
            category['tags'] = self.tag_list_to_regexp(category['tags'])
            category['count'] = 0

        def count_matching(categories, book_tags):
            ''' Bump the count of each category matching any tag; return True if one did. '''
            matched = False
            for category in categories:
                tag_regexp = category['tags']
                if any(tag_regexp.match(tag) for tag in book_tags):
                    category['count'] += 1
                    matched = True
            return matched

        field_map = self.db.FIELD_MAP
        tags_column_idx = field_map['tags']
        pubdate_column_idx = field_map['pubdate']
        title_column_idx = field_map['title']
        rating_column_idx = field_map['rating']
        format_column_idx = field_map['formats']
        authors_column_idx = field_map['authors']

        total_book_count = 0
        unknown_genre_book_count = 0
        unknown_location_book_count = 0
        common_tags_on_unknown_genre = {}
        common_tags_on_unknown_location = {}
        year_histogram = {}
        unknown_year_book_count = 0
        rating_integer_histogram = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
        rating_exact_histogram = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
        title_word_counter = {}
        format_count_histogram = {0: 0, 1: 0, 2: 0}
        format_counter = {}
        custom_rating_counters = []
        custom_rating_column_idxs = []
        custom_rating_labels = []
        custom_date_counters = []
        custom_date_column_idxs = []
        custom_date_labels = []

        # Set up one counter per custom column of type 'rating' or 'datetime'.
        for custom_column in self.db.custom_column_num_map.values():
            datatype = custom_column['datatype']
            if datatype == 'rating':
                custom_rating_counters.append({1: 0, 2: 0, 3: 0, 4: 0, 5: 0})
                custom_rating_column_idxs.append(field_map[custom_column['num']])
                custom_rating_labels.append(custom_column['name'])
            elif datatype == 'datetime':
                custom_date_counters.append({})
                custom_date_column_idxs.append(field_map[custom_column['num']])
                custom_date_labels.append(custom_column['name'])

        # The handling of UNDEFINED_DATE has timezone troubles...
        undefined_day = UNDEFINED_DATE.date()

        # Iterate over visible books. (Per the original author's note,
        # self.db.data.iterall() would iterate over all books instead.)
        for record in self.db.data:
            # A missing title is treated as empty instead of crashing below.
            book_title = record[title_column_idx] or ""
            tags = record[tags_column_idx]
            total_book_count += 1

            # Count each distinct word once per book.
            book_words = set()
            for book_word in book_title.lower().split():
                # str.strip replaces the Python-2-only string.strip().
                book_word = book_word.strip(":&-(),")
                if book_word:
                    book_words.add(book_word)
            for book_word in book_words:
                self.increase_string_count(title_word_counter, book_word)

            # This became much slower when going from "in string" matching to regexps. Too slow?
            book_tag_list = tags.lower().split(',') if tags else []

            if not count_matching(genres, book_tag_list):
                unknown_genre_book_count += 1
                self.add_tags_to_counter(common_tags_on_unknown_genre, book_tag_list)

            if not count_matching(locations, book_tag_list):
                unknown_location_book_count += 1
                # Collected but currently not part of the results; the
                # original author found the output meaningless.
                self.add_tags_to_counter(common_tags_on_unknown_location, book_tag_list)

            # Publication year; a missing pubdate counts as unknown, matching
            # the guard used for custom dates below.
            pubdate_datetime = record[pubdate_column_idx]
            if pubdate_datetime and pubdate_datetime.date() != undefined_day:
                year = pubdate_datetime.year
                if year < 1000:
                    print("Bad pubdate (" + str(pubdate_datetime) + ") for book " + book_title + " by " + str(record[authors_column_idx]))
                    print("UNDEFINED_DATE = " + str(UNDEFINED_DATE))
                self.increase_number_count(year_histogram, year)
            else:
                unknown_year_book_count += 1

            # Custom dates
            for (custom_date_counter, custom_date_column_idx) in zip(custom_date_counters, custom_date_column_idxs):
                custom_date = record[custom_date_column_idx]
                if custom_date and custom_date.date() != undefined_day:
                    year = custom_date.year
                    if year < 1000:
                        print("Bad date (" + str(custom_date) + ") for book " + book_title + " by " + str(record[authors_column_idx]))
                        print("UNDEFINED_DATE = " + str(UNDEFINED_DATE))
                    self.increase_number_count(custom_date_counter, year)

            # Ratings
            rating = record[rating_column_idx]
            if rating:
                int_rating = self.ten_rating_to_five_rating(rating)
                rating_integer_histogram[int_rating] += 1
                rating_exact_histogram[rating] += 1

            # Custom ratings
            for (custom_rating_counter, custom_rating_column_idx) in zip(custom_rating_counters, custom_rating_column_idxs):
                rating = record[custom_rating_column_idx]
                if rating:
                    int_rating = self.ten_rating_to_five_rating(rating)
                    custom_rating_counter[int_rating] += 1

            # Formats
            formats = record[format_column_idx]
            format_list = formats.split(',') if formats else []
            for book_format in format_list:
                self.increase_number_count(format_counter, book_format)
            self.increase_number_count(format_count_histogram, len(format_list))

        # Drop tags that say nothing about genre or location.
        for over_generic_tag in over_generic_tags:
            common_tags_on_unknown_genre.pop(over_generic_tag, None)
            common_tags_on_unknown_location.pop(over_generic_tag, None)

        common_strange_genre_tags = sorted(common_tags_on_unknown_genre.items(), key=itemgetter(1), reverse=True)

        # Drop filler words from the title word statistics.
        over_generic_book_title_words = ["the", "a", "of", "at", "in", "to", "on", "and", "for", "an", "from", "&", "-", "with", "is", "are", "was", "by"]
        for over_generic_book_title_word in over_generic_book_title_words:
            title_word_counter.pop(over_generic_book_title_word, None)

        top_list_max_length = 30
        common_title_words = self.truncate_top_list(sorted(title_word_counter.items(), key=itemgetter(1), reverse=True), top_list_max_length)

        results = []
        self.add_result_to_results(results, genres, unknown_genre_book_count, total_book_count, "Genre")
        self.add_histogram_to_results(results, year_histogram, unknown_year_book_count, "Published")
        self.add_histogram_to_results(results, rating_integer_histogram, 0, "Ratings", total_book_count)
        self.add_histogram_to_results(results, rating_exact_histogram, 0, "Exact ratings", total_book_count)
        self.add_top_list_to_results(results, common_title_words, "Common title words", "Word")
        self.add_top_list_to_results(results, self.truncate_top_list(common_strange_genre_tags, top_list_max_length), "Common 'strange' tags", "Tag")
        for (custom_rating_counter, custom_rating_label) in zip(custom_rating_counters, custom_rating_labels):
            self.add_histogram_to_results(results, custom_rating_counter, 0, custom_rating_label)
        for (custom_date_counter, custom_date_label) in zip(custom_date_counters, custom_date_labels):
            self.add_histogram_to_results(results, custom_date_counter, 0, custom_date_label)
        self.add_counter_to_results(results, format_counter.items(), total_book_count, "Formats")
        self.add_histogram_to_results(results, format_count_histogram, 0, "Formats/book", total_book_count)
        self.add_result_to_results(results, locations, unknown_location_book_count, total_book_count, "Location")

        dialog = ChartDialog(self.gui, self.icon, results)
        dialog.show()