Example No. 1
from dawg import RecordDAWG


def load_dict(path):
    # each key maps to a record of two unsigned 32-bit integers
    format = ">2I"
    try:
        d = RecordDAWG(format)
        d.load(path)
        return d
    except Exception as e:
        print("load dict error:", e)
        return None
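
For context, a minimal sketch of how a compatible dictionary could be built and saved with the same ">2I" record format before load_dict reads it back; the keys, values and file name are purely illustrative:

from dawg import RecordDAWG

# illustrative payload: each key carries a tuple of two unsigned 32-bit integers
data = [('beijing', (86, 10)), ('shanghai', (86, 21))]
RecordDAWG(">2I", data).save('area_codes.dawg')

d = load_dict('area_codes.dawg')
if d is not None:
    print(d['beijing'])  # [(86, 10)]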
Example No. 2
from dawg import RecordDAWG


class AreaCode(object):
    def __init__(self, dict_path):
        self.dict_path = dict_path
        # each key maps to a record of two unsigned 32-bit integers
        self.format = ">2I"
        try:
            self.dict = RecordDAWG(self.format)
            self.dict.load(dict_path)
        except Exception as e:
            print("load dict error:", dict_path, e)
import csv
import heapq
import math
import pickle

import nltk
import numpy
import Stemmer
from dawg import RecordDAWG

# Huffman, Report, Comment, build_query_tree, binary_read_line_generator and
# posting_list_separator are assumed to be helpers defined elsewhere in the
# surrounding project.


class SearchEngine:
    def __init__(self):
        self.seek_list = None
        self.comment_file = None
        self.index_file = None
        self.symbol_to_encoding_dict = None
        self.cids = None
        self.comment_offsets_cid = None
        self.comment_offsets = None
        self.comment_term_counts = None
        self.comment_csv_reader = None
        self.authors_list = None
        self.articles_list = None
        self.reply_to_index = None
        self.collection_term_count = 0
        self.stemmer = Stemmer.Stemmer('english')
        self.tokenizer = nltk.tokenize.ToktokTokenizer()
        self.report = Report()

    def load_index(self, directory):
        self.seek_list = RecordDAWG('>QQ')
        self.seek_list.load(f'{directory}/compressed_seek_list.dawg')
        self.index_file = open(f'{directory}/compressed_index', mode='rb')
        with open(f'{directory}/symbol_to_encoding_dict.pickle',
                  mode='rb') as f:
            self.symbol_to_encoding_dict = pickle.load(f)
        self.comment_offsets = numpy.load(
            f'{directory}/comment_offsets.npy', mmap_mode=None)
        self.comment_term_counts = numpy.load(
            f'{directory}/comment_term_counts.npy', mmap_mode=None)
        with open(f'{directory}/collection_term_count.pickle', mode='rb') as f:
            self.collection_term_count = pickle.load(f)
        self.comment_file = open(f'{directory}/comments.csv', mode='rb')
        self.comment_csv_reader = csv.reader(
            binary_read_line_generator(self.comment_file))
        with open(f'{directory}/authors_list.pickle', mode='rb') as f:
            self.authors_list = pickle.load(f)
        with open(f'{directory}/articles_list.pickle', mode='rb') as f:
            self.articles_list = pickle.load(f)
        with open(f'{directory}/reply_to_index.pickle', mode='rb') as f:
            self.reply_to_index = pickle.load(f)
        self.cids = numpy.load(f'{directory}/cids.npy', mmap_mode='r')
        self.comment_offsets_cid = numpy.load(
            f'{directory}/comment_offsets_cid.npy', mmap_mode='r')

    def load_posting_list_parts(self, stem):
        offset, size = self.seek_list[stem][0]
        self.index_file.seek(offset)
        binary_data = self.index_file.read(size)
        decoded_posting_list = Huffman.decode(
            binary_data, self.symbol_to_encoding_dict)
        return [stem] + decoded_posting_list.split(posting_list_separator)

    def get_comment_term_count(self, comment_offset):
        return self.comment_term_counts[numpy.searchsorted(
            self.comment_offsets, comment_offset)]

    def get_cid_to_offset(self, cid):
        return self.comment_offsets_cid[numpy.searchsorted(self.cids, cid)]

    # returns scores based on a language model with Dirichlet smoothing
    # query_terms: list of query terms, stemmed and filtered
    # comment_offsets: list of offsets of comments into comment file
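    #
    # For each query term t and comment D the added score is
    #     log( (f(t, D) + mu * cf(t) / |C|) / (|D| + mu) )
    # where f(t, D) is the frequency of t in D, cf(t) the collection frequency
    # of t (query_term_count below), |C| the collection term count and |D| the
    # number of terms in the comment.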
    def get_dirichlet_smoothed_score(self, query_terms, comment_offsets,
                                     mu=1500):
        ranked_comments = [[0, offset] for offset in comment_offsets]
        for query_term in query_terms:
            query_stem = self.stemmer.stemWord(query_term)
            if query_stem not in self.seek_list or \
                    self.seek_list[query_stem][0][1] > \
                    self.collection_term_count / 100:
                continue
            posting_list_parts = self.load_posting_list_parts(query_stem)
            query_term_count = int(posting_list_parts[1])
            comment_offsets_index = 0
            for comment_list in posting_list_parts[2:]:
                if comment_offsets_index >= len(comment_offsets):
                    break
                first_occurrence = int(comment_list.partition(',')[0])
                len_occurrences = comment_list.count(',') + 1
                while (comment_offsets_index < len(comment_offsets)
                        and first_occurrence >
                        comment_offsets[comment_offsets_index]):
                    # term not found -> 0 occurrences in comment
                    ranked_comments[comment_offsets_index][0] += math.log(
                        (mu * query_term_count / self.collection_term_count)
                        / (self.get_comment_term_count(comment_offsets[
                            comment_offsets_index]) + mu))
                    comment_offsets_index += 1

                if (comment_offsets_index < len(comment_offsets)
                        and first_occurrence ==
                        comment_offsets[comment_offsets_index]):
                    fD_query_term = len_occurrences - 1
                    ranked_comments[comment_offsets_index][0] += math.log(
                        (fD_query_term + (mu * query_term_count
                                          / self.collection_term_count))
                        / (self.get_comment_term_count(comment_offsets[
                            comment_offsets_index]) + mu))
                    comment_offsets_index += 1
            while comment_offsets_index < len(comment_offsets):
                # no matches found
                ranked_comments[comment_offsets_index][0] += math.log(
                    (mu * query_term_count / self.collection_term_count)
                    / (self.get_comment_term_count(comment_offsets[
                        comment_offsets_index]) + mu))
                comment_offsets_index += 1

        return ranked_comments

    # load comment from given offset into comment file
    def load_comment(self, offset):
        self.comment_file.seek(offset)
        comment_as_list = next(self.comment_csv_reader)
        comment = Comment()
        comment.cid = int(comment_as_list[0])
        # comment.article_url = self.articles_list[int(comment_as_list[1])]
        # comment.author = self.authors_list[int(comment_as_list[2])]
        comment.text = comment_as_list[3]
        # comment.timestamp = comment_as_list[4]
        # comment.parent_cid = int(comment_as_list[5]) \
        #    if comment_as_list[5] != '' else -1
        comment.upvotes = int(comment_as_list[6]) \
            if len(comment_as_list) >= 7 else 0
        comment.downvotes = int(comment_as_list[7]) \
            if len(comment_as_list) >= 8 else 0

        return comment

    def load_comment_from_cid(self, cid):
        return self.load_comment(self.get_cid_to_offset(cid))

    def load_cid_only(self, offset):
        self.comment_file.seek(offset)
        csv_line_start = self.comment_file.read(8)
        comma_position = csv_line_start.find(b',')
        while comma_position == -1:
            csv_line_start += self.comment_file.read(8)
            comma_position = csv_line_start.find(b',')
        return csv_line_start[:comma_position].decode()

    # returns offsets into comment file for all comments containing stem in
    # ascending order
    def get_offsets_for_stem(self, stem):
        if stem not in self.seek_list:
            return []
        posting_list_parts = self.load_posting_list_parts(stem)
        return [int(x.partition(',')[0]) for x in posting_list_parts[2:]]

    def phrase_query(self, phrase, suffix=''):
        if phrase == '' and suffix != '':
            # suffix of the phrase now becomes prefix for a prefix query
            return self.prefix_query(suffix)

        if ' ' not in phrase:
            offsets = self.keyword_query(phrase)
        else:
            stem_offset_size_list = []  # may contain duplicates!
            for sentence in nltk.tokenize.sent_tokenize(phrase):
                for token in self.tokenizer.tokenize(sentence):
                    stem = self.stemmer.stemWord(token)
                    if stem not in self.seek_list:
                        continue
                    stem_offset_size_list.append((stem, self.seek_list[stem]))

            if len(stem_offset_size_list) == 0:
                return []

            # sort by posting_list size
            stem_offset_size_list.sort(key=lambda t: t[1][0][1])
            smallest_stem = stem_offset_size_list[0][0]
            second_smallest_stem = stem_offset_size_list[1][0] \
                if len(stem_offset_size_list) >= 2 and \
                stem_offset_size_list[1][1][0][1] < \
                self.collection_term_count / 100 else ''
            offsets = self.get_offsets_for_stem(smallest_stem)
            if second_smallest_stem != '':
                offsets = set(offsets)
                offsets.intersection_update(
                    self.get_offsets_for_stem(second_smallest_stem))

        result = []
        phrase_to_check = phrase if suffix == '' else f'{phrase} {suffix}'
        for offset in offsets:
            comment = self.load_comment(offset)
            if phrase_to_check in comment.text.lower():
                result.append(offset)
        return result

    def prefix_query(self, prefix):
        stems_with_prefix = self.seek_list.keys(prefix)
        result = []
        for stem in stems_with_prefix:
            result.extend(self.get_offsets_for_stem(stem))
        return result

    def keyword_query(self, keyword):
        return self.get_offsets_for_stem(
            self.stemmer.stemWord(keyword))

    def reply_to_query(self, target_cid):
        return [self.get_cid_to_offset(cid)
                for cid in self.reply_to_index.get(target_cid, ())]

    def basic_search(self, token_node):
        # search for a single query token

        if token_node.kind == 'phrase_prefix':  # phrase prefix query: 'hi ye'*
            return self.phrase_query(
                token_node.phrase_start, token_node.prefix)
        elif token_node.kind == 'phrase':  # phrase query: 'european union'
            return self.phrase_query(token_node.phrase)
        elif token_node.kind == 'prefix':  # prefix query: isra*
            return self.prefix_query(token_node.prefix)
        elif token_node.kind == 'reply_to':  # ReplyTo query: ReplyTo:12345
            return self.reply_to_query(token_node.target_cid)
        elif token_node.kind == 'keyword':  # keyword query: merkel
            return self.keyword_query(token_node.keyword)
        else:
            raise RuntimeError(f'unknown token_node.kind: {token_node.kind}')

    def print_comments(self, offset_iterable, printIdsOnly=True):
        if printIdsOnly:
            print(','.join((self.load_cid_only(offset)
                            for offset in offset_iterable)))
        else:
            for offset in offset_iterable:
                comment = self.load_comment(offset)
                print(f'{comment.cid},{comment.text}')

    def search(self, query, top_k=None, printIdsOnly=True):
        print(f'\nsearching for "{query}":')

        query_tree_root = build_query_tree(query)
        if query_tree_root.is_boolean_query:
            or_result = set()
            with self.report.measure('searching'):
                for and_node in query_tree_root.children:
                    and_result = None
                    to_be_removed = []
                    for child in and_node.children:
                        child_result = self.basic_search(child)
                        if child.is_negated:
                            to_be_removed.append(child_result)
                        elif and_result is None:
                            and_result = set(child_result)
                        else:
                            and_result.intersection_update(child_result)
                    and_result.difference_update(*to_be_removed)
                    or_result.update(and_result)

            self.print_comments(or_result, printIdsOnly)
        else:  # non bool query
            with self.report.measure('searching'):
                children_results = (self.basic_search(child)
                                    for child in query_tree_root.children)
                comment_offsets = list(frozenset().union(*children_results))

            with self.report.measure('calculating scores'):
                # rated_comment is a tuple of (score, offset)
                rated_comments = self.get_dirichlet_smoothed_score(
                    query_tree_root.query_terms, comment_offsets)
                if top_k is not None and len(rated_comments) > top_k:
                    top_k_rated_comments = \
                        rated_comments[:top_k]
                    heapq.heapify(top_k_rated_comments)
                    for rated_comment in rated_comments[top_k:]:
                        heapq.heappushpop(top_k_rated_comments, rated_comment)
                    result = top_k_rated_comments
                else:
                    result = rated_comments

                result.sort(key=lambda x: x[0], reverse=True)

            self.print_comments(
                (offset for score, offset in result), printIdsOnly)
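
A minimal usage sketch, assuming an index directory previously produced by the project's indexing step; the directory name and queries are illustrative:

if __name__ == '__main__':
    engine = SearchEngine()
    engine.load_index('index_data')
    engine.search('merkel', top_k=10)     # ranked keyword query
    engine.search("'european union'")     # phrase query, quoted as in basic_search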