def load_nfull_collection(dir,
                          vocab_json='vocab.json',
                          postings_json='postings.json'):
    """Load a vocabulary+postings collection serialized as JSON.

    Args:
        dir: directory holding the serialized index files.
        vocab_json: filename of the vocabulary JSON file.
        postings_json: filename of the postings JSON file.

    Returns:
        WordCollectionInfo wrapping the deserialized vocabulary and postings.
    """
    import os  # local import: keeps this fix self-contained in the snippet

    # os.path.join is portable across platforms, unlike manual '/' concatenation;
    # explicit UTF-8 avoids locale-dependent decoding of the JSON files.
    with open(os.path.join(dir, vocab_json), 'r', encoding='utf-8') as vocab_file:
        vocabulary = json.load(vocab_file)
    with open(os.path.join(dir, postings_json), 'r', encoding='utf-8') as postings_file:
        postings = json.load(postings_file)
    return WordCollectionInfo(vocabulary, postings)
def get_biword_index(documents) -> WordCollectionInfo:
    """Index every consecutive word pair of *documents* via add_word.

    Returns a WordCollectionInfo with a sorted vocabulary, the postings
    produced by add_word, the documents, and the total token-pair count.
    """
    vocab = dict()
    posting_lists = list()
    total_pairs = 0
    next_word_id = 0
    for pair, doc_id in get_words_pair(documents):
        # add_word registers the pair and hands back the next free word id.
        next_word_id = add_word(pair, next_word_id, doc_id,
                                vocab, posting_lists)
        total_pairs += 1
    return WordCollectionInfo(SortedDict(vocab), posting_lists, documents,
                              total_pairs)
def load_collection(dir,
                    vocab_json='vocabSER.txt',
                    postings_json='postingsSER.txt',
                    documents_json='documentsSER.txt'):
    """Load a full collection (vocabulary, postings, documents) from pickle files.

    Args:
        dir: directory holding the pickled index files.
        vocab_json: filename of the pickled vocabulary.
        postings_json: filename of the pickled postings.
        documents_json: filename of the pickled documents.

    Returns:
        WordCollectionInfo wrapping the three deserialized structures.

    SECURITY NOTE: pickle.load executes arbitrary code from the file; only
    load collections serialized by this application, never untrusted data.
    """
    import os  # local import: keeps this fix self-contained in the snippet

    # os.path.join is portable, unlike manual '/' concatenation.
    with open(os.path.join(dir, vocab_json), 'rb') as vocab_file:
        vocabulary = pickle.load(vocab_file)
    with open(os.path.join(dir, postings_json), 'rb') as postings_file:
        postings = pickle.load(postings_file)
    # Fix: the original reused the name `vocab_file` here, shadowing the
    # vocabulary handle and obscuring which file is being read.
    with open(os.path.join(dir, documents_json), 'rb') as documents_file:
        documents = pickle.load(documents_file)
    return WordCollectionInfo(vocabulary, postings, documents)
# Example #4
def get_biword_index(documents) -> WordCollectionInfo:
    """Build a biword index: each word pair maps to a Term whose postings
    list holds de-duplicated document ids and whose freq counts occurrences."""
    index = {}
    total_tokens = 0
    for pair, doc_id in get_words_pair(documents):
        total_tokens += 1
        term = index.get(pair)
        if term is None:
            # First sighting of this biword: open its postings list.
            term = Term([doc_id])  # postings is a list of doc ids
            index[pair] = term
        elif term.postings[-1] != doc_id:
            # NOTE(review): only the tail is checked, which de-duplicates
            # correctly only if tokens arrive grouped by document — confirm
            # against get_words_pair's iteration order.
            term.postings.append(doc_id)
        term.freq += 1
    return WordCollectionInfo(SortedDict(index), documents, total_tokens)
# Example #5
def get_incidence_matrix(documents) -> WordCollectionInfo:
    """Build a term-document incidence matrix over *documents* via add_word.

    The document count is passed to add_word so it can size each term's row.
    """
    vocab = dict()
    matrix_rows = list()
    total_tokens = 0
    next_word_id = 0
    n_docs = len(documents)
    for token, doc_id in get_words(documents):
        # add_word records the token and returns the next free word id.
        next_word_id = add_word(token, next_word_id, doc_id, vocab,
                                matrix_rows, n_docs)
        total_tokens += 1
    return WordCollectionInfo(SortedDict(vocab), matrix_rows, documents,
                              total_tokens)
# Example #6
def get_coordinate_index(documents) -> WordCollectionInfo:
    """Build a positional (coordinate) index.

    Each word maps to a Term whose postings is a dict:
    doc_id -> {"frequency": occurrences in that doc, "positions": [pos, ...]}.

    Bug fixes vs. the original:
    * the first-occurrence branch wrote the misspelled key "frequncy" while
      the update branch read "frequency", raising KeyError the second time a
      word appeared in the same document — both branches now use "frequency";
    * the per-word `freq` increment was dedented outside the loop, so it ran
      once (for the last word seen) instead of once per token — it is now
      inside the loop body.
    """
    words_dict = {}
    all_words_counter = 0
    for word, pos, doc_id in get_positioned_words(documents):
        all_words_counter += 1
        if word not in words_dict:
            # postings: dict keyed by doc_id, value holds per-document stats.
            words_dict[word] = Term()
        doc_postings = words_dict[word].postings
        if doc_id not in doc_postings:
            doc_postings[doc_id] = {"frequency": 1, "positions": [pos]}
        else:
            doc_postings[doc_id]["frequency"] += 1
            doc_postings[doc_id]["positions"].append(pos)
        words_dict[word].freq += 1
    return WordCollectionInfo(SortedDict(words_dict), documents, all_words_counter)
def get_coordinate_index(documents) -> WordCollectionInfo:
    """Build a positional index with a split vocabulary/postings layout.

    vocabulary: word -> {"id": row index into postings, "frequency": count}
    postings:   one dict per word id, mapping doc_id -> [positions...]
    """
    vocabulary = dict()
    postings = list()
    total_tokens = 0
    next_word_id = 0
    for token, position, doc_id in get_positioned_words(documents):
        entry = vocabulary.get(token)
        if entry is None:
            # New word: allocate an id and a fresh postings row for it.
            entry = {"id": next_word_id, "frequency": 0}
            vocabulary[token] = entry
            postings.append(dict())
            next_word_id += 1
        entry["frequency"] += 1
        # Append this position under the document, creating the list on demand.
        postings[entry["id"]].setdefault(doc_id, []).append(position)
        total_tokens += 1
    return WordCollectionInfo(SortedDict(vocabulary), postings, documents,
                              total_tokens)