import os
import os.path as osp


# Version that assigns numeric doc IDs and keeps a universal doc-ID list
# (postings.not_list()) so that NOT queries can be answered later.
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    # File names are numeric doc IDs, so sort them numerically.
    training_files = sorted(os.listdir(training_data_dir), key=lambda x: int(x))
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]
    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = int(training_file)
        doc_path = osp.join(training_data_dir, training_file)
        # Record every doc ID in the universal list used for NOT queries.
        postings.not_list().add(doc_id)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # Second pass: turn line numbers into byte offsets so that a term's
    # postings line can later be reached with a single seek.
    with open(postings_file) as f:
        f.readline()  # skip the postings list containing all doc IDs
        current_line = 1
        while True:
            offset = f.tell()
            line = f.readline()
            if not line:
                break
            term = dictionary.term_for_offset(current_line)
            dictionary.add_term(term, offset)
            current_line += 1
    dictionary.save()
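
# Why every doc ID goes into postings.not_list(): a Boolean NOT can then
# be evaluated by subtracting a term's postings from the universal doc-ID
# set. A minimal sketch with plain Python sets; evaluate_not is a
# hypothetical helper, not part of the original classes:
def evaluate_not(term_postings, all_doc_ids):
    # NOT term == every document that does not contain the term.
    return sorted(set(all_doc_ids) - set(term_postings))
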
# Version that keeps string doc IDs and precomputes idf values for
# ranked retrieval.
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir))
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]
    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = training_file
        doc_path = osp.join(training_data_dir, training_file)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # Second pass: turn line numbers into byte offsets.
    with open(postings_file) as f:
        current_line = 0
        while True:
            offset = f.tell()
            line = f.readline()
            if not line:
                break
            term = dictionary.term_for_offset(current_line)
            dictionary.add_term(term, offset, update_freq=False)
            current_line += 1
    # Compute idf values now that the collection size is known.
    dictionary.generate_idf(len(training_files))
    dictionary.save()
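
# Why the second pass stores byte offsets: at query time the postings file
# can be opened once and each term's list fetched with a single seek
# instead of a line-by-line scan. A minimal sketch, assuming a hypothetical
# dictionary.get_offset(term) accessor that returns the offset stored via
# add_term(), and space-separated doc IDs on each postings line:
def fetch_postings(postings_file, dictionary, term):
    with open(postings_file) as f:
        f.seek(dictionary.get_offset(term))  # jump straight to the term's line
        return f.readline().split()
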
import linecache
import os

import nltk


# Version that tracks term frequencies per document and stores document
# lengths for length normalisation.
def build_index(directory, dictionary_file, postings_file):
    files = os.listdir(directory)
    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    stemmer = nltk.stem.porter.PorterStemmer()
    for doc_id in files:
        tf_list = {}
        line_number = 1
        offset = 0
        # Use linecache to read the document line by line.
        line = linecache.getline(os.path.join(directory, doc_id), line_number)
        while line != '':
            # Tokenize the line into sentences, then each sentence into words.
            sentences = nltk.sent_tokenize(line)
            for sentence in sentences:
                tokens = nltk.word_tokenize(sentence)
                for token in tokens:
                    # Apply stemming and case folding.
                    stemmed_token = stemmer.stem(token).lower()
                    if dictionary.has_term(stemmed_token):
                        # Term already exists, so find its postings row.
                        offset = dictionary.get_offset(stemmed_token)
                        # If the postings row already has this doc ID,
                        # increment tf; otherwise increment df and add it.
                        if postings.has_doc_id(doc_id, offset):
                            postings.increment_tf(doc_id, offset)
                        else:
                            dictionary.increment_df(stemmed_token)
                            postings.add_doc_id(doc_id, offset)
                    else:
                        # New term: add it to the dictionary and postings.
                        offset = postings.add_new_term()
                        postings.add_doc_id(doc_id, offset)
                        dictionary.add_new_term(stemmed_token, offset)
                    # Keep track of tf values of all terms in the doc.
                    tf_list[stemmed_token] = tf_list.get(stemmed_token, 0) + 1
            line_number += 1
            line = linecache.getline(os.path.join(directory, doc_id), line_number)
        # Store the document length.
        dictionary.add_doc_length(doc_id, tf_list.values())
    # Save data.
    postings.save(dictionary)
    dictionary.save()
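
# What add_doc_length() plausibly does with the raw tf values: compute the
# document's vector length for cosine normalisation. A minimal sketch
# assuming 1 + log10(tf) term weights; the weighting actually used by the
# Dictionary class is an assumption here:
import math

def doc_length(tf_values):
    # Euclidean norm of the log-weighted tf vector.
    return math.sqrt(sum((1 + math.log10(tf)) ** 2 for tf in tf_values))
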
# Version that records only document frequency (no tf), relying on
# postings.add_doc_id() to report whether the doc ID is new.
def build_index(directory, dictionary_file, postings_file):
    files = os.listdir(directory)
    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    stemmer = nltk.stem.porter.PorterStemmer()
    for doc_id in files:
        postings.add_doc(doc_id)
        line_number = 1
        # Use linecache to read the document line by line.
        line = linecache.getline(os.path.join(directory, doc_id), line_number)
        while line != '':
            # Tokenize the line into sentences, then each sentence into words.
            sentences = nltk.sent_tokenize(line)
            for sentence in sentences:
                tokens = nltk.word_tokenize(sentence)
                for token in tokens:
                    # Apply stemming and case folding.
                    stemmed_token = stemmer.stem(token).lower()
                    if dictionary.has_term(stemmed_token):
                        # Term already exists, so find its postings row.
                        offset = dictionary.get_offset(stemmed_token)
                        result = postings.add_doc_id(doc_id, offset)
                        # Result indicates whether the doc ID is new; only
                        # then does the term's document frequency grow.
                        if result:
                            dictionary.increment_frequency(stemmed_token)
                    else:
                        # New term: add it to the dictionary and postings.
                        offset = postings.add_new_term()
                        postings.add_doc_id(doc_id, offset)
                        dictionary.add_new_term(stemmed_token, offset)
            line_number += 1
            line = linecache.getline(os.path.join(directory, doc_id), line_number)
    # Save data.
    postings.save(dictionary)
    dictionary.save()
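
# A minimal usage sketch; the directory and file names below are
# placeholders rather than paths from the original code:
if __name__ == '__main__':
    # The NLTK tokenizers need the 'punkt' model, e.g. via
    # nltk.download('punkt'), before indexing.
    build_index('./training/', 'dictionary.txt', 'postings.txt')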