Example #1
0
 def parse_buffer(self, undecoded_text_buffer, encoding=""):
     """Tokenize a raw (undecoded) text buffer and normalize every token.

     Splits *undecoded_text_buffer* into span tokens via ling_utils, then
     rewrites each token's text through unify_word so downstream lookups
     see canonical word forms.  Returns the (mutated) token sequence.
     """
     from ling_utils import span_tokenize, unify_word
     spans = span_tokenize(undecoded_text_buffer, encoding=encoding)
     for span in spans:
         span.token = unify_word(span.token)
     return spans
 def flush_buffer():
     """Drain the in-memory word index into the reducer processes.

     For each buffered token: decode from windows-1251, normalize with
     unify_word, pick a reducer by hashing the normalized token, and emit
     a "<token>\\t<code code ...>\\n" line to it.  Afterwards the shared
     buffer state is reset.  Relies on module-level log_out, words_index,
     reducers_pool and words_in_buffer.
     """
     log_out.write("flushing buffer..\n")
     log_out.flush()
     pool_size = len(reducers_pool)
     for raw_token, codes in words_index[0].items():
         word = unify_word(raw_token.decode("windows-1251"))
         # Route deterministically (within this run) by hash of the word.
         target = reducers_pool[abs(hash(word)) % pool_size]
         target.write(word + "\t" + " ".join(str(c) for c in codes) + "\n")
     words_index[0] = {}
     words_in_buffer[0] = 0
 def find_title(self, title_query):
     """Find objects whose title contains every token of *title_query*.

     Tokenizes the windows-1251 query, normalizes each token with
     unify_word, and intersects the posting lists from self.title_index.

     Returns a set of matching object ids, or [] when any token is
     unknown or the intersection becomes empty.  Fix: a query producing
     no tokens at all (e.g. punctuation-only) used to return None, which
     crashed callers that iterate the result; it now returns [].
     """
     matched_objects = None
     for match in span_tokenize_windows1251(title_query):
         token = unify_word(match[-1].decode("windows-1251"))
         # An unknown token means no title can satisfy the full query.
         if token not in self.title_index:
             return []
         if matched_objects is None:
             matched_objects = set(self.title_index[token])
         else:
             matched_objects &= set(self.title_index[token])
         # Early exit once the intersection is empty.
         if not matched_objects:
             return []
     # matched_objects is still None only when the query had no tokens.
     return matched_objects if matched_objects is not None else []
    def find_mentions_of_author_and_title(self, query):
        """Rank books by how many query tokens hit their title/author index.

        Takes the first 10 tokens of the windows-1251-encoded *query*,
        normalizes them with unify_word, and counts per-book matches
        across both self.title_index and self.author_index (a token
        present in both indexes scores twice).  Books matching at least
        60% of the distinct tokens are returned, ordered by descending
        score (ties broken by descending book id, as before).
        """
        import math

        matches = span_tokenize_windows1251(query.encode("windows-1251"))[:10]
        tokens = {unify_word(m[-1].decode("windows-1251")) for m in matches}

        books_scores = {}
        # One pass per index removes the duplicated title/author loops;
        # the per-book counts are identical to the original two blocks.
        for index in (self.title_index, self.author_index):
            for token in tokens:
                if token in index:
                    for obj_id in set(index[token]):
                        books_scores[obj_id] = books_scores.get(obj_id, 0) + 1

        min_match = math.ceil(len(tokens) * 0.6)
        ranked = [(score, book) for book, score in books_scores.items()
                  if score >= min_match]
        ranked.sort(reverse=True)
        return [book for _, book in ranked]
 def add_title(self, title, object_id):
     """Index every normalized token of *title* under *object_id*.

     Tokenizes the windows-1251 title, normalizes each token with
     unify_word, and appends object_id to that token's posting list in
     self.title_index (creating the list on first use).
     """
     index = self.title_index
     for span in span_tokenize_windows1251(title):
         word = unify_word(span[-1].decode("windows-1251"))
         index.setdefault(word, []).append(object_id)
def get_surname(author_str_windows1251):
    """Guess the surname in a windows-1251 encoded author string.

    Normalizes every token and returns the longest one — ties are broken
    by taking the lexicographically greatest candidate (tuple-max on
    (length, word)).  Returns "" when the string yields no tokens.
    """
    words = [unify_word(span[-1].decode("windows-1251"))
             for span in span_tokenize_windows1251(author_str_windows1251)]
    if not words:
        return ""
    _, surname = max((len(word), word) for word in words)
    return surname