def process_book(book_obj):
    """Index every string-valued field of one book, segment by segment.

    For each field of ``book_obj`` the text is taken from the file named by
    ``field.field_file_path`` when that path is set, otherwise from
    ``field.field_value``.  The text is tokenized with
    ``span_tokenize_windows1251`` and the resulting matches are cut into
    consecutive runs of at most ``SEGMENT_SIZE`` matches.  Each run is
    registered through ``segment_index_writer.add_segment`` and every token
    in it is passed to ``to_word_index`` together with a code built by
    ``match2code``.

    Side effects: updates ``max_book_id[0]`` with the largest numeric object
    id seen so far, prints a diagnostic for fields whose value is not a
    ``str``, and calls ``books_counter.add()`` once per book.

    Args:
        book_obj: object exposing ``object_id`` and ``object_fields``; each
            field exposes ``field_id``, ``field_file_path`` and
            ``field_value``.
    """
    obj_id = book_obj.object_id
    # int(obj_id) does not change inside the loops below, so convert it once.
    numeric_obj_id = int(obj_id)
    max_book_id[0] = max(numeric_obj_id, max_book_id[0])

    for field in book_obj.object_fields:
        field_id = field.field_id

        if field.field_file_path:
            # Context manager closes the handle deterministically instead of
            # leaving it to the garbage collector.
            # NOTE(review): the default (text) mode is kept from the original;
            # confirm whether binary mode is needed for the offsets below.
            with open(field.field_file_path) as field_file:
                field_value = field_file.read()
        else:
            field_value = field.field_value

        if not isinstance(field_value, str):
            # A single parenthesized argument produces exactly the text the
            # original comma-separated print statement produced.
            print("%s : %s  - not a string value" % (obj_id, field_id))
            continue

        tokens_matches = span_tokenize_windows1251(field_value)
        match_count = len(tokens_matches)

        for first_match_index in xrange(0, match_count, SEGMENT_SIZE):
            # The last segment may be shorter than SEGMENT_SIZE.
            last_match_index = min(first_match_index + SEGMENT_SIZE - 1,
                                   match_count - 1)
            first_match = tokens_matches[first_match_index]
            last_match = tokens_matches[last_match_index]

            # Presumably match[0] is the token start offset and match[1] its
            # length, so the segment spans from the first token's start to
            # the last token's end - TODO confirm against the tokenizer.
            segment_start = first_match[0]
            segment_length = last_match[0] + last_match[1] - segment_start
            segment_id = segment_index_writer.add_segment(
                obj_id, field_id, segment_start, segment_length)

            for match_index in xrange(first_match_index, last_match_index + 1):
                segment_match = tokens_matches[match_index]
                # Position of the token relative to the start of its segment.
                segment_match_index = match_index - first_match_index
                segment_token = segment_match[-1]
                match_case = segment_match[-2]

                # Upper-case tokens weigh 2, title-case 1, anything else 0.
                if match_case == CASE_UPPER:
                    word_case_weight = 2
                elif match_case == CASE_TITLE:
                    word_case_weight = 1
                else:
                    word_case_weight = 0

                code = match2code(segment_id, numeric_obj_id,
                                  segment_match_index, word_case_weight)
                to_word_index(segment_token, code)

    books_counter.add()
def find_title(self, title_query):
    """Return the ids of objects whose title contains every query token.

    The query is tokenized with ``span_tokenize_windows1251``; each token is
    decoded from windows-1251, normalized with ``unify_word`` and looked up
    in ``self.title_index``.  The result is the intersection of the id
    collections stored for all tokens.

    Args:
        title_query: query text handed directly to the tokenizer
            (presumably a windows-1251 encoded byte string - confirm with
            callers).

    Returns:
        A non-empty ``set`` of object ids when every token matched, or an
        empty list when any token is unknown, the intersection is empty, or
        the query contains no tokens at all.  (Previously a token-less query
        fell through and returned ``None``.)
    """
    matched_objects = None
    for match in span_tokenize_windows1251(title_query):
        token = unify_word(match[-1].decode("windows-1251"))
        if token not in self.title_index:
            # One unknown token makes a full match impossible.
            return []

        token_objects = set(self.title_index[token])
        if matched_objects is None:
            matched_objects = token_objects
        else:
            matched_objects &= token_objects

        if not matched_objects:
            # Intersection already empty; further tokens cannot refill it.
            return []

    if matched_objects is None:
        # The query produced no tokens: nothing to match.
        return []
    return matched_objects
def find_mentions_of_author_and_title(self, query):
    """Rank objects by how many query tokens hit their title or author.

    The query is encoded to windows-1251 and tokenized; only the first 10
    token matches are used.  Each distinct normalized token adds one point to
    every object listed for it in ``self.title_index`` and one point to every
    object listed for it in ``self.author_index`` (so a token present in both
    indexes for the same object counts twice).

    Args:
        query: text query; it must support ``.encode("windows-1251")``.

    Returns:
        List of object ids whose score is at least ``ceil(0.6 * number of
        distinct tokens)``, ordered by descending ``(score, object id)``.
    """
    import math

    encoded_query = query.encode("windows-1251")
    leading_matches = span_tokenize_windows1251(encoded_query)[:10]

    tokens = set()
    for token_match in leading_matches:
        tokens.add(unify_word(token_match[-1].decode("windows-1251")))

    hit_counts = {}
    for token in tokens:
        # Title index first, then author index; each lookup scores separately.
        for index in (self.title_index, self.author_index):
            if token not in index:
                continue
            for obj_id in set(index[token]):
                hit_counts[obj_id] = hit_counts.get(obj_id, 0) + 1

    threshold = math.ceil(len(tokens) * 0.6)
    ranked = sorted(
        ((hits, obj_id) for obj_id, hits in hit_counts.items()
         if hits >= threshold),
        reverse=True)
    return [obj_id for _, obj_id in ranked]
def add_title(self, title, object_id):
    """Register ``object_id`` under every token of ``title``.

    The title is tokenized with ``span_tokenize_windows1251``; each token is
    decoded from windows-1251, normalized with ``unify_word`` and used as a
    key of ``self.title_index``, whose values are lists of object ids.  A
    token that occurs several times in the title appends the id once per
    occurrence.

    Args:
        title: title text handed directly to the tokenizer.
        object_id: id to append to the list of each title token.
    """
    for token_match in span_tokenize_windows1251(title):
        raw_word = token_match[-1]
        index_key = unify_word(raw_word.decode("windows-1251"))
        postings = self.title_index.setdefault(index_key, [])
        postings.append(object_id)
def get_surname(author_str_windows1251):
    """Pick the longest normalized word of an author string.

    The string is tokenized with ``span_tokenize_windows1251``; each token is
    decoded from windows-1251 and normalized with ``unify_word``.

    Args:
        author_str_windows1251: author text handed directly to the tokenizer.

    Returns:
        The longest word; among words of equal length the one that compares
        greatest is chosen.  An empty string when there are no words.
    """
    candidates = []
    for token_match in span_tokenize_windows1251(author_str_windows1251):
        candidates.append(unify_word(token_match[-1].decode("windows-1251")))

    if len(candidates) == 0:
        return ""

    # Order by length first, then by the word itself to break ties.
    return max(candidates, key=lambda word: (len(word), word))