Python span_tokenize_windows1251 Exemples, ling_utils.span_tokenize_windows1251 Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : build_index_map.py Projet : mavlyutovrus/light_search

 def process_book(book_obj):
     obj_id = book_obj.object_id
     max_book_id[0] = max(int(obj_id), max_book_id[0])
     for field in book_obj.object_fields:
         field_id = field.field_id
         if field.field_file_path:
             field_value = open(field.field_file_path).read()
         else:
             field_value = field.field_value
         if not type(field_value) == str:
             print obj_id, ":", field_id, " - not a string value"
             continue
         tokens_matches = span_tokenize_windows1251(field_value)
         for first_match_index in xrange(0, len(tokens_matches), SEGMENT_SIZE):
             last_match_index = min(first_match_index + SEGMENT_SIZE - 1, len(tokens_matches) - 1)
             first_match = tokens_matches[first_match_index]
             last_match = tokens_matches[last_match_index]
             segment_start = first_match[0]
             segment_length = last_match[0] + last_match[1] - segment_start
             segment_id = segment_index_writer.add_segment(obj_id, field_id, segment_start, segment_length)
             for match_index in xrange(first_match_index, last_match_index + 1):
                 segment_match = tokens_matches[match_index]
                 segment_match_index = match_index - first_match_index
                 segment_token = segment_match[-1]
                 match_case = segment_match[-2]
                 word_case_weight = match_case == CASE_UPPER and 2 or match_case == CASE_TITLE and 1 or 0
                 code = match2code(segment_id, int(obj_id), segment_match_index, word_case_weight)
                 to_word_index(segment_token, code)
     books_counter.add()

Exemple #2

0

Afficher le fichier

Fichier : custom_fields_search_engine.py Projet : mavlyutovrus/light_search

 def find_title(self, title_query):
     """Return the objects whose indexed title contains every query token.

     The query is tokenized with span_tokenize_windows1251; each token is
     decoded from windows-1251 and normalized with unify_word before being
     looked up in ``self.title_index``.

     Returns an empty list as soon as a token is missing from the index or
     the running intersection becomes empty, the set of matching objects
     otherwise, and ``None`` when the query produces no tokens at all.
     """
     candidates = None
     for span in span_tokenize_windows1251(title_query):
         word = unify_word(span[-1].decode("windows-1251"))
         if word not in self.title_index:
             # An unknown token can never be satisfied.
             return []
         postings = set(self.title_index[word])
         if candidates is None:
             # First token seeds the candidate set.
             candidates = postings
         else:
             candidates &= postings
         if not candidates:
             return []
     return candidates

Exemple #3

0

Afficher le fichier

Fichier : custom_fields_search_engine.py Projet : mavlyutovrus/light_search

    def find_mentions_of_author_and_title(self, query):
        """Rank objects by how many query tokens hit their title or author index.

        Only the first 10 token matches of the query (encoded to windows-1251
        before tokenizing) are used, de-duplicated after unify_word
        normalization.  An object gains one point per token found under it in
        ``self.title_index`` and one more per token found under it in
        ``self.author_index``.  Objects scoring at least 60% of the number of
        distinct tokens (rounded up) are returned, highest score first; ties
        are ordered by descending object value.
        """
        import math

        encoded_query = query.encode("windows-1251")
        leading_spans = span_tokenize_windows1251(encoded_query)[:10]
        tokens = set(unify_word(span[-1].decode("windows-1251"))
                     for span in leading_spans)

        hits_per_book = {}
        for token in tokens:
            # The title and author indexes are scored the same way.
            for index in (self.title_index, self.author_index):
                if token not in index:
                    continue
                for obj_id in set(index[token]):
                    hits_per_book[obj_id] = hits_per_book.get(obj_id, 0) + 1

        threshold = math.ceil(len(tokens) * 0.6)
        ranked = sorted(
            ((hits, book) for book, hits in hits_per_book.items()
             if hits >= threshold),
            reverse=True)
        return [book for _, book in ranked]

Exemple #4

0

Afficher le fichier

Fichier : custom_fields_search_engine.py Projet : mavlyutovrus/light_search

 def add_title(self, title, object_id):
     """Register ``object_id`` under every normalized token of ``title``.

     Tokens come from span_tokenize_windows1251, are decoded from
     windows-1251 and normalized with unify_word; the id is appended to the
     token's list in ``self.title_index`` (created on first use), so a token
     repeated in the title adds the id once per occurrence.
     """
     for span in span_tokenize_windows1251(title):
         word = unify_word(span[-1].decode("windows-1251"))
         postings = self.title_index.setdefault(word, [])
         postings.append(object_id)

Exemple #5

0

Afficher le fichier

Fichier : custom_fields_search_engine.py Projet : mavlyutovrus/light_search

def get_surname(author_str_windows1251):
    """Pick the surname out of an author string as its longest word.

    The string is tokenized with span_tokenize_windows1251 and each token is
    decoded from windows-1251 and normalized with unify_word.  The longest
    resulting word is returned; among words of equal length the one that
    compares greatest wins.  Returns "" when there are no words.
    """
    candidates = []
    for span in span_tokenize_windows1251(author_str_windows1251):
        candidates.append(unify_word(span[-1].decode("windows-1251")))
    if not candidates:
        return ""
    # Key on (length, word) so ties in length fall back to string ordering.
    return max(candidates, key=lambda word: (len(word), word))