import math

import nltk
import pandas as pd

# NOTE: the Dictionary class used below is assumed to be defined or imported
# elsewhere in this package.


class Indexer(object):
    def __init__(self, preprocess=None, tokenize=None):
        self.dictionary = None
        self.postings = None
        self.positions = None
        self.byte_repr = b""
        self.repr_ptrs = []
        self.vocabulary = None
        self.dfs = None
        # allow callers to plug in their own preprocessing / tokenization
        if preprocess:
            self.__preprocess = preprocess
        if tokenize:
            self.__tokenize = tokenize

    def __tokenize(self, document):
        return nltk.word_tokenize(document)

    def __preprocess(self, tokens):
        return [token.lower() for token in tokens]

    def __build_postings(self, doc_tokens):
        """
        Builds postings from document tokens.

        :param doc_tokens: list of (docID, token list) pairs, one per document
        :return: post_dict: maps term -> (list of docIDs, list of term frequencies)
                 df_dict: maps term -> document frequency
                 doc_len_dict: maps docID -> document vector norm
                 pos_dict: maps docID -> {term -> list of positions}
                 cf_dict: maps term -> collection frequency
        """
        post_dict = {}
        tf_dict = {}
        pos_dict = {}
        df_dict = {}
        cf_dict = {}
        doc_len_dict = {}
        count = 0

        # collect tf, position and postings information per document
        for doc in doc_tokens:
            doc_id, term_list = doc
            tf_dict[doc_id] = {}
            pos_dict[doc_id] = {}
            doc_len_dict[doc_id] = 0

            for pos, term in enumerate(term_list):
                # collection frequency: count every occurrence of the term
                cf_dict[term] = cf_dict.get(term, 0) + 1
                # term frequency within this document
                tf_dict[doc_id][term] = tf_dict[doc_id].get(term, 0) + 1
                # positions of the term within this document
                orig_pos = pos_dict[doc_id].get(term, [])
                orig_pos.append(pos)
                pos_dict[doc_id][term] = orig_pos
                # postings: append doc_id if the list is empty or does not
                # already end with this doc_id (documents are processed in order)
                if term not in post_dict:
                    post_dict[term] = [doc_id]
                elif post_dict[term][-1] != doc_id:
                    post_dict[term].append(doc_id)

            count += 1
            print("Building postings for doc {} out of {}...".format(
                count, len(doc_tokens)))

        # build vocabulary
        self.vocabulary = [term for term in post_dict.keys()]

        # restructure tf for each term in each document
        for term in self.vocabulary:
            # document frequency is the length of the postings list
            df_dict[term] = len(post_dict[term])
            # gather tfs and accumulate the squared log-weighted tf
            # (1 + log10 tf) into the document vector norm
            tfs = []
            for doc_id in post_dict[term]:
                log_tf = 1 + math.log(tf_dict[doc_id][term], 10)
                tfs.append(tf_dict[doc_id][term])
                doc_len_dict[doc_id] += log_tf * log_tf
            post_dict[term] = (post_dict[term], tfs)

        for key, value in doc_len_dict.items():
            doc_len_dict[key] = math.sqrt(value)

        return post_dict, df_dict, doc_len_dict, pos_dict, cf_dict

    def __save_byte_repr(self, postings_path, encoding_length=3):
        """
        Construct the byte representation of this index. Entries are stored
        consecutively, and each entry is formatted as follows:

        [length of entry (number of items), term_id, length of postings
         (integer count), postings: docID1, docID2, ..., tf1, tf2, ...,
         positions: length1, pos1_1, pos1_2, ..., length2, pos2_1, ...]
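
        As an illustration (hypothetical values, not taken from a real run):
        a term with term_id 7 that occurs in documents 3 and 9, with term
        frequencies 2 and 1 and positions [0, 4] and [5], is written as the
        integer sequence

            [12, 7, 2, 3, 9, 2, 1, 2, 0, 4, 1, 5]

        where 12 = 2 * len(postings) + 3 + total length of the position
        blocks (including their per-document length fields) = 4 + 3 + 5,
        i.e. the number of integers in the entry, each encoded in
        `encoding_length` bytes.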

        :param postings_path: the path of the postings file to write
        :param encoding_length: number of bytes used to encode each integer
        :return: nothing
        """
        def to_byte_rep(x):
            return x.to_bytes(encoding_length, byteorder="little")

        file_ptr = 0
        ptrs = {}
        with open(postings_path, "wb") as f:
            for term_id, token in enumerate(self.vocabulary):
                # remember the byte offset of this term's entry
                ptrs[term_id] = file_ptr
                doc_ids, tfs = self.postings[token]

                # total length of the position blocks: one length field plus
                # the positions for each document
                len_positions = 0
                for doc_id in doc_ids:
                    len_positions += len(self.positions[doc_id][token]) + 1

                # length of entry (number of integers, including this field)
                f.write(to_byte_rep(len(doc_ids) * 2 + 3 + len_positions))
                # term_id
                f.write(to_byte_rep(term_id))
                # length of postings
                f.write(to_byte_rep(len(doc_ids)))
                # postings
                for doc_id in doc_ids:
                    f.write(to_byte_rep(doc_id))
                # tfs
                for tf in tfs:
                    f.write(to_byte_rep(tf))
                # positions
                for doc_id in doc_ids:
                    pos_list = self.positions[doc_id][token]
                    f.write(to_byte_rep(len(pos_list)))
                    for pos in pos_list:
                        f.write(to_byte_rep(pos))

                file_ptr += (len(doc_ids) * 2 + 3 + len_positions) * encoding_length

        self.repr_ptrs = ptrs

    def index(self, input_file):
        """
        Construct the inverted index from a CSV corpus in which each row
        provides a `document_id` and its `content`.

        :param input_file: path to the CSV file containing the documents to index
        :return: nothing
        """
        doc_ids = []
        document_tokens = []
        csv_data = pd.read_csv(input_file)

        for i in range(csv_data.shape[0]):
            document = csv_data.iloc[i]
            doc_id = int(document.document_id)
            doc_ids.append(doc_id)
            # tokenize and preprocess the document content
            tokens = []
            tokens += self.__preprocess(self.__tokenize(document.content))
            document_tokens.append((doc_id, tokens))
            print("Processing {} out of {} lines...".format(
                i + 1, csv_data.shape[0]))

        # build postings from (docID, token list) pairs
        postings_dict, df_dict, doc_len_dict, pos_dict, cf_dict = \
            self.__build_postings(document_tokens)

        self.vocabulary = sorted(postings_dict.keys())
        self.postings = postings_dict
        self.positions = pos_dict
        self.dfs = df_dict

        self.dictionary = Dictionary(self.vocabulary, doc_ids)
        self.dictionary.add_dfs(self.dfs)
        self.dictionary.add_doc_len(doc_len_dict)
        self.dictionary.add_cfs(cf_dict)

    def save(self, postings_path, dictionary_path):
        """
        Saves the byte representation of the postings and the dictionary to file.

        :param postings_path: the path of the postings file
        :param dictionary_path: the path of the dictionary file
        :return: nothing
        """
        # construct and write the byte representation of the postings,
        # then store the resulting file pointers in the dictionary
        self.__save_byte_repr(postings_path)
        self.dictionary.add_pointers(self.repr_ptrs)
        self.dictionary.save(dictionary_path)
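

# Minimal usage sketch (hypothetical file paths; assumes a CSV with
# `document_id` and `content` columns and that Dictionary supports the calls
# made above). This is an illustration, not part of the original module.
if __name__ == "__main__":
    indexer = Indexer()
    indexer.index("data/documents.csv")
    indexer.save("out/postings.bin", "out/dictionary.txt")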