import json
import pickle

# SortedDict is assumed here to come from the sortedcontainers package.
from sortedcontainers import SortedDict


def load_nfull_collection(dir, vocab_json='vocab.json', postings_json='postings.json'):
    # Load a vocabulary and postings list previously serialized as JSON.
    with open(dir + '/' + vocab_json, 'r') as vocab_file:
        vocabulary = json.load(vocab_file)
    with open(dir + '/' + postings_json, 'r') as postings_file:
        postings = json.load(postings_file)
    return WordCollectionInfo(vocabulary, postings)
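
# A minimal usage sketch, assuming an index was previously serialized into a directory
# (the name 'index_dir' below is illustrative) and that WordCollectionInfo keeps its
# constructor arguments as attributes:
#
#   collection = load_nfull_collection('index_dir')
#   print(len(collection.vocabulary), "terms in the loaded vocabulary")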

def get_biword_index(documents) -> WordCollectionInfo:
    # Biword index: every pair of consecutive words is indexed as a single term.
    vocabulary = dict()
    postings = list()
    all_words_counter = 0
    unique_word_id = 0
    for word_pair, doc_id in get_words_pair(documents):
        unique_word_id = add_word(word_pair, unique_word_id, doc_id, vocabulary, postings)
        all_words_counter += 1
    return WordCollectionInfo(SortedDict(vocabulary), postings, documents, all_words_counter)
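
# get_words_pair is not shown in this listing; a minimal sketch of what it is assumed to do:
# walk each document and yield every pair of consecutive tokens together with the docID.
# The lowercase whitespace tokenization and the space-joined pair format are illustrative
# assumptions, not taken from the original code.
def get_words_pair(documents):
    for doc_id, text in enumerate(documents):
        tokens = text.lower().split()
        for first, second in zip(tokens, tokens[1:]):
            yield first + " " + second, doc_id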

def load_collection(dir, vocab_json='vocabSER.txt', postings_json='postingsSER.txt',
                    documents_json='documentsSER.txt'):
    # Load a pickled vocabulary, postings list and document collection.
    with open(dir + '/' + vocab_json, 'rb') as vocab_file:
        vocabulary = pickle.load(vocab_file)
    with open(dir + '/' + postings_json, 'rb') as postings_file:
        postings = pickle.load(postings_file)
    with open(dir + '/' + documents_json, 'rb') as documents_file:
        documents = pickle.load(documents_file)
    return WordCollectionInfo(vocabulary, postings, documents)
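
# The pickled files read above must have been produced by a matching save step, which is not
# part of this listing; a hypothetical counterpart, assuming WordCollectionInfo exposes its
# constructor arguments as vocabulary, postings and documents attributes (the name
# save_collection is illustrative):
def save_collection(collection, dir, vocab_json='vocabSER.txt', postings_json='postingsSER.txt',
                    documents_json='documentsSER.txt'):
    with open(dir + '/' + vocab_json, 'wb') as vocab_file:
        pickle.dump(collection.vocabulary, vocab_file)
    with open(dir + '/' + postings_json, 'wb') as postings_file:
        pickle.dump(collection.postings, postings_file)
    with open(dir + '/' + documents_json, 'wb') as documents_file:
        pickle.dump(collection.documents, documents_file)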

def get_biword_index(documents) -> WordCollectionInfo:
    # Alternative biword index built around Term objects instead of parallel lists.
    words_dict = {}
    all_words_counter = 0
    for word, docID in get_words_pair(documents):
        all_words_counter += 1
        if word not in words_dict:
            # postings is a list of docIDs
            words_dict[word] = Term([docID])
        elif words_dict[word].postings[-1] != docID:
            words_dict[word].postings.append(docID)
        words_dict[word].freq += 1
    return WordCollectionInfo(SortedDict(words_dict), documents, all_words_counter)
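
# The Term class is also not shown; a minimal sketch consistent with how it is used here and
# in the Term-based get_coordinate_index below: it holds a postings structure (a docID list
# for the biword index, a per-document dict for the coordinate index) plus a collection
# frequency counter.
class Term:
    def __init__(self, postings=None):
        # Either a list of docIDs or a dict keyed by docID, depending on the index type.
        self.postings = postings if postings is not None else {}
        self.freq = 0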

def get_incidence_matrix(documents) -> WordCollectionInfo:
    vocabulary = dict()
    postings = list()
    all_words_counter = 0
    unique_word_id = 0
    doc_count = len(documents)
    for word, doc_id in get_words(documents):
        unique_word_id = add_word(word, unique_word_id, doc_id, vocabulary, postings, doc_count)
        all_words_counter += 1
    return WordCollectionInfo(SortedDict(vocabulary), postings, documents, all_words_counter)
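
# Neither get_words nor add_word appears in this listing; minimal sketches of what they are
# assumed to do, based on how they are called above. The tokenization is an illustrative
# lowercase whitespace split, and documents is assumed to be a list of raw text strings
# whose list index serves as the docID.
def get_words(documents):
    for doc_id, text in enumerate(documents):
        for word in text.lower().split():
            yield word, doc_id


# When doc_count is given (the incidence-matrix case) the posting entry is assumed to be a
# 0/1 incidence vector over documents; otherwise it is an ordered list of docIDs, mirroring
# the vocabulary/postings structure built explicitly in the second get_coordinate_index below.
def add_word(word, unique_word_id, doc_id, vocabulary, postings, doc_count=None):
    if word not in vocabulary:
        vocabulary[word] = {"id": unique_word_id, "frequency": 0}
        postings.append([0] * doc_count if doc_count is not None else [])
        unique_word_id += 1
    vocabulary[word]["frequency"] += 1
    word_id = vocabulary[word]["id"]
    if doc_count is not None:
        postings[word_id][doc_id] = 1
    elif not postings[word_id] or postings[word_id][-1] != doc_id:
        postings[word_id].append(doc_id)
    return unique_word_id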

def get_coordinate_index(documents) -> WordCollectionInfo:
    # Positional (coordinate) index built around Term objects.
    # Term.postings is a dict: key - docID, value - per-document frequency and list of positions.
    words_dict = {}
    all_words_counter = 0
    for word, pos, docID in get_positioned_words(documents):
        all_words_counter += 1
        if word not in words_dict:
            words_dict[word] = Term()
        if docID not in words_dict[word].postings:
            words_dict[word].postings[docID] = {"frequency": 1, "positions": [pos]}
        else:
            words_dict[word].postings[docID]["frequency"] += 1
            words_dict[word].postings[docID]["positions"].append(pos)
        words_dict[word].freq += 1
    return WordCollectionInfo(SortedDict(words_dict), documents, all_words_counter)
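
# get_positioned_words is assumed to yield (word, position, docID) triples, where position is
# the token's offset inside its document; a minimal sketch under the same tokenization
# assumptions as get_words above:
def get_positioned_words(documents):
    for doc_id, text in enumerate(documents):
        for pos, word in enumerate(text.lower().split()):
            yield word, pos, doc_id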

def get_coordinate_index(documents) -> WordCollectionInfo:
    vocabulary = dict()
    postings = list()
    all_words_counter = 0
    unique_word_id = 0
    for word, pos, doc_id in get_positioned_words(documents):
        if word not in vocabulary:
            word_id = unique_word_id
            vocabulary[word] = {"id": word_id, "frequency": 0}
            postings.append(dict())
            unique_word_id += 1
        vocabulary[word]["frequency"] += 1
        word_id = vocabulary[word]["id"]
        if doc_id not in postings[word_id]:
            postings[word_id][doc_id] = [pos]
        else:
            postings[word_id][doc_id].append(pos)
        all_words_counter += 1
    return WordCollectionInfo(SortedDict(vocabulary), postings, documents, all_words_counter)
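
# A small end-to-end sketch of the vocabulary/postings variant above, assuming
# WordCollectionInfo simply stores its constructor arguments as attributes and that documents
# are plain strings as in the generator sketches; the sample texts are illustrative:
if __name__ == "__main__":
    docs = ["to be or not to be", "to sleep perchance to dream"]
    index = get_coordinate_index(docs)
    word_id = index.vocabulary["to"]["id"]
    # Expected positions of "to" per document: {0: [0, 4], 1: [0, 3]}
    print(index.postings[word_id])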