def test_build_index2(mock_file):
    """ Check if we can build index from string with repeated word. """
    my_index = InvertedIndex()
    my_index.build(mock_file)
    assert my_index.index_data == {'test': [1], 'me': [1], 'first': [1]}

def test_add_new_doc_one_word():
    doc_id = 1
    content = 'foo'
    foo_index = InvertedIndex()
    foo_index.add_new_document(doc_id, content)
    assert doc_id in foo_index.inverted_index[content], (
        "added a new document with one word but couldn't find it in the built index")

def test_query2(mock_file):
    """ Check if we can query some correct words with repeats. """
    my_index = InvertedIndex()
    my_index.build(mock_file)
    assert my_index.query(['string', 'long']) == {1, 10, 100500}

def test_get_index_data(self):
    data_source = DataSource(self.books_file)
    inverted_index = InvertedIndex(self.index_file)
    inverted_index.get_inverted_index(data_source.read_file())
    # pickle requires the file to be opened in binary mode
    with open(inverted_index.file_name, 'rb') as f:
        keywords_dict = pickle.load(f)
    self.assertEqual(keywords_dict[self.keyword], ['B000UZNREG'])

class IndexTrainer(object):
    def __init__(self):
        self.index = InvertedIndex()
        self.bow = Bow()
        self.extractor = Extractor('surf')
        print(self.index.author)
        print(self.index.description)

    def load_feature(self, path='../models/feature.npy'):
        self.features = np.load(path)
        if len(self.features) > 200000:
            self.features = self.features[:200000]
        print("feature shape: ", self.features.shape)
        return self.features

    def run(self, path):
        self.bow.load()
        self.index.reset(self.bow.centers)
        images = imutil.get_list_image(path)
        t = imutil.Timer(1)
        t.tic()
        for i, image in enumerate(images):
            descriptors = self.extractor.extract(image)
            self.index.append(image, descriptors)
            if (i + 1) % 1000 == 0:
                t.toc('finish 1000 images: ')
                t.tic()

def test_query_2_intersect_words(words=['bow', 'tfidf']):
    inv_idx = InvertedIndex(TEST_INDEX_TABLE)
    doc_ids = inv_idx.query(words)
    right_answer = {4}
    assert doc_ids == right_answer

def test_many_queries_not_in_one_article(query):
    index = InvertedIndex()
    article_id = '12'
    with open(ONE_ARTICLE_PATH, 'r') as fd:
        index.build(fd)
    assert article_id not in index.find_articles(query), (
        'found an article for a query that is not in the one-article corpus')

def test_build_index(mock_file):
    """ Check if we can build index from simple string. """
    my_index = InvertedIndex()
    my_index.build(mock_file)
    assert my_index.index_data == {'test': [1], 'me': [1], 'first': [1]}

def test_single_element(self):
    c = [('http://test.net', 'Simple text')]
    ii = II(c)
    ii.create_index()
    d = dd(list)
    d['simple'] = ['http://test.net']
    d['text'] = ['http://test.net']
    assert d == ii.index

def test_build_index4(mock_file):
    """ Check if we can build index from file with several lines. """
    my_index = InvertedIndex()
    my_index.build(mock_file)
    # Check the index
    assert len(my_index.index_data) == 12
    assert len(my_index.index_data.values()) == 12

def test_query_from_loaded2(mock_file):
    """ Check if we can query some non-existent words. """
    my_index = InvertedIndex()
    my_index.load('test.index')
    assert my_index.query(['me', 'test', 'non existed']) == set()
    assert len(mock_file.mock_calls) > 1

def test_load_index(mock_file):
    """ Check if we can load index. """
    my_index = InvertedIndex()
    my_index.load('my_Test.index')
    assert my_index.index_data == {'me': [1], 'test': [1, 2], 'you': [2]}
    assert len(mock_file.mock_calls) > 1

def main():
    os.chdir(RELATIVE_PATH_TO_CORPUS)
    docs = os.listdir(os.getcwd())
    index = InvertedIndex(docs)
    index.build()
    QueryHandler(index).loop()

def test_query(mock_file):
    """ Check if we can query some correct words. """
    my_index = InvertedIndex()
    my_index.build(mock_file)
    assert my_index.query(['test']) == {1}

def test_add_new_doc_multi_word():
    doc_id = 23
    word_1 = ' foo '
    word_2 = ' \t bar\t'
    foo_index = InvertedIndex()
    foo_index.add_new_document(doc_id, word_1 + word_2)
    assert_mes = ("added a new document with 2 words and different separators "
                  "but couldn't find a word in the built index")
    assert doc_id in foo_index.inverted_index[word_1.strip()], assert_mes
    assert doc_id in foo_index.inverted_index[word_2.strip()], assert_mes

def test_query_one_doc_in_index():
    index = InvertedIndex()
    index.inverted_index = defaultdict(set, {
        'foo': {1, 2, 3},
        'bar': {1},
        'foobar': {1, 2}
    })
    assert index.query(['foo', 'bar']) == {1}, (
        'did not find a doc that is present in the index')

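# The add/query tests above exercise an index whose `inverted_index` attribute
# maps a word to the set of document ids containing it, with `query` returning
# the intersection of those sets. A minimal sketch of a class satisfying that
# behaviour (illustrative only; names follow the tests, the real implementation
# may differ):
from collections import defaultdict


class SketchInvertedIndex:
    def __init__(self):
        self.inverted_index = defaultdict(set)

    def add_new_document(self, doc_id, content):
        # str.split() with no argument handles spaces and tabs alike
        for word in content.split():
            self.inverted_index[word].add(doc_id)

    def query(self, words):
        # A document matches only if it contains every queried word
        result = None
        for word in words:
            postings = self.inverted_index.get(word, set())
            result = postings if result is None else result & postings
        return result or set()
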
def test_one_article():
    index = InvertedIndex()
    with open(ONE_ARTICLE_PATH, 'r') as fd:
        index.build(fd)
    with open(ONE_ARTICLE_PATH, 'r') as fd:
        article_id, words = fd.readline().split(maxsplit=1)
    words = words.split()
    assert article_id == index.find_articles(words), (
        'did not find the article when querying all words from the article')

def __init__(self, config):
    # An object representing the inverted index:
    # {term: [df, {tweet_id: list of tweet information, ...}, ...], ...}
    self.inverted_idx = InvertedIndex()
    # Represents the GloVe vector
    self.document_dict = {}
    self.num_of_docs = 0
    self.global_capitals = {}
    self.entities_dict = Counter()
    self.config = config
    self.glove_dict = {}

def test_query_from_loaded(mock_file):
    """ Check if we can query after we load the index. """
    my_index = InvertedIndex()
    my_index.load('test.index')
    assert my_index.query(['me', 'test']) == {1}
    assert len(mock_file.mock_calls) > 1

def test_multiple_elements(self):
    c = [('One', 'one Two three'), ('two', 'three'), ('three', 'two Three')]
    d = dd(list)
    d['one'] = ['One']
    d['two'] = ['One', 'three']
    d['three'] = ['One', 'two', 'three']
    ii = II(c)
    ii.create_index()
    assert d == ii.index

def exec():
    # Query comes from the command line
    querry_ = sys.argv[1:]
    ii = InvertedIndex()
    for file in [f for f in listdir('../data') if isfile(join('../data', f))]:
        document_ = Document('../data/' + file)
        ii.add_document(document_)
    sim_table = sorted(ii.querry(querry_))
    for document in sim_table:
        print(document)

def test_unicode_query_two_docs_in_index():
    index = InvertedIndex()
    index.inverted_index = defaultdict(set, {
        'один': {1, 2, 3},
        'bar': {1},
        'два': {1, 2}
    })
    assert index.query(['один', 'два']) == {1, 2}, (
        'did not find the two docs that are present in the index with unicode keys')

def test_inv_index_query(load_inverted_index, wiki_docs):
    words = ['after', 'were']
    doc_ind = InvertedIndex.query(load_inverted_index, words)
    assert {25, 290}.issubset(doc_ind)

    words = ['neizvesnie', 'slova']
    doc_ind = InvertedIndex.query(load_inverted_index, words)
    assert len(doc_ind) == 0

    words = ['after', 'were']
    index_dict = build_inverted_index(wiki_docs)
    inv_index = InvertedIndex(index_dict)
    doc_ind = inv_index.query(words)
    assert {25, 290}.issubset(doc_ind)

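# The test above builds a raw mapping with build_inverted_index and wraps it in
# InvertedIndex. A minimal sketch of such a builder, assuming each document is a
# (doc_id, text) pair with whitespace tokenization (the real signature and
# tokenizer may differ):
def sketch_build_inverted_index(documents):
    index_dict = {}
    for doc_id, text in documents:
        for word in text.split():
            # Record every document id in which the word occurs
            index_dict.setdefault(word, set()).add(doc_id)
    return index_dict
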
def test_build_index3(mock_file):
    """ Check if we can build index from string with special characters. """
    my_index = InvertedIndex()
    my_index.build(mock_file)
    assert my_index.index_data == {
        'test': [1],
        'test,': [1],
        'me': [1],
        'first': [1]
    }

def test_check_compression_good():
    json_inverted_index = InvertedIndex.load(
        "inverted_index/inverted_json.index", JsonStoragePolicy())
    compressed_inverted_index = InvertedIndex.load(
        "inverted_index/inverted_json_zip.index", JsonZipStoragePolicy())
    assert json_inverted_index.query(["two", "words"]) == (
        compressed_inverted_index.query(["two", "words"])
    ), "compressed index gives a different answer"
    assert json_inverted_index.get_size() == compressed_inverted_index.get_size(), (
        "compressed index has a different number of records")

def test_parse_queries_three_query():
    expected = [['foo', 'bar'], ['one', 'два', '123'], ['один']]
    arguments = Namespace(query=expected,
                          query_file_utf8=None,
                          query_file_cp1251=None)
    result = InvertedIndex().parse_queries(arguments)
    assert result == expected, 'wrong parsing of three queries passed directly'

def test_parse_queries_two_query():
    expected = [['foo', 'bar'], ['one', 'два', '123']]
    with open('test_two_queries.cp1251', 'r', encoding='cp1251') as f:
        arguments = Namespace(query=None,
                              query_file_utf8=None,
                              query_file_cp1251=f)
        result = InvertedIndex().parse_queries(arguments)
    assert result == expected, 'wrong parsing query file with two queries'

def test_parse_queries_one_query():
    expected = [['one', 'два', '123']]
    with open('test_one_query.utf8', 'r') as f:
        arguments = Namespace(query=None,
                              query_file_utf8=f,
                              query_file_cp1251=None)
        result = InvertedIndex().parse_queries(arguments)
    assert result == expected, 'wrong parsing query file with one query'

def search(dictionary_file, postings_file, query_file, output_file):
    try:
        # Remove previous output file
        os.remove(output_file)
    except OSError:
        pass

    inverted_index = InvertedIndex(dictionary_file, postings_file)
    meta_data = get_meta_data()

    tree = ET.parse(query_file)
    root = tree.getroot()
    title_tokens = []
    description_tokens = []
    raw_tokens = []
    for child in root:
        if child.tag == 'title':
            title_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))
        elif child.tag == 'description':
            description_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))

    raw_tokens = helper.remove_stop_words_without_normalize(
        helper.filter_invalid_characters(raw_tokens))
    additional_tokens = []
    for token in list(set(raw_tokens)):
        additional_tokens.extend(helper.get_similar_words(token))

    title_tokens = helper.remove_stop_words(
        helper.filter_invalid_characters(title_tokens))
    description_tokens = helper.remove_stop_words(
        helper.filter_invalid_characters(description_tokens))

    # Tight results favour high precision; we use them as a proxy for true positives.
    tight_results = execute_query(title_tokens, description_tokens, [],
                                  inverted_index, meta_data)

    global top_UPC_classes
    global top_IPC_classes
    global top_family_members
    top_UPC_classes = get_top_classes(tight_results, meta_data['UPC_class'], 6)
    top_IPC_classes = get_top_classes(tight_results, meta_data['IPC_class'], 4)
    top_family_members = get_top_members(tight_results,
                                         meta_data['family_members'], 30)
    # supplementary_results = expand_query(tight_results, meta_data['doc_top_terms'], inverted_index, meta_data)

    additional_tokens = helper.normalize_tokens(list(set(additional_tokens)))
    results = execute_query(title_tokens, description_tokens, additional_tokens,
                            inverted_index, meta_data)
    k = int(TOP_X_PERCENT_RESULTS * len(results))
    # j = int(TOP_X_PERCENT_RESULTS * len(supplementary_results))
    # results = list(set(results[:k] + supplementary_results[:j]))
    write_to_output(output_file, results[:k])

def test_dump_and_load_index(tmp_path, tiny_sample_document):
    index_dir = tmp_path / "tiny_example_dir"
    index_dir.mkdir()
    index_file = index_dir / "tiny_example.index"
    docs = tiny_sample_document
    inv_table = build_inverted_index(docs)
    inv_table.dump(index_file)
    assert inv_table == TINY_SAMPLE_INV_TABLE
    loaded_inv_table = InvertedIndex.load(index_file)
    assert inv_table == loaded_inv_table

def test_can_dump_and_load_inverted_index(tmpdir, small_dataset_index):
    index_fio = tmpdir.join('inverted.index')
    small_dataset_index.dump(index_fio)
    load_inverted_index = InvertedIndex.load(index_fio)
    assert small_dataset_index == load_inverted_index, (
        "load should return the same inverted index"
    )
    assert {} != load_inverted_index, (
        "load should not return an empty inverted index"
    )

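# The two dump/load tests above only require that dump followed by load
# reproduces an equal index. A minimal sketch of that round trip, assuming the
# index holds a plain word -> list-of-doc-ids dict serialized as JSON (the
# attribute name and on-disk format here are illustrative, not the project's
# actual storage policy):
import json


class SketchDumpableIndex:
    def __init__(self, index_data=None):
        self.index_data = index_data or {}

    def __eq__(self, other):
        return (isinstance(other, SketchDumpableIndex)
                and self.index_data == other.index_data)

    def dump(self, filepath):
        # Serialize the mapping as JSON text
        with open(filepath, 'w', encoding='utf-8') as fout:
            json.dump(self.index_data, fout)

    @classmethod
    def load(cls, filepath):
        with open(filepath, 'r', encoding='utf-8') as fin:
            return cls(json.load(fin))
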
def init_inverted_index():
    idx = 1
    doc_list = []
    inverted_index = InvertedIndex()
    # Load serialized documents until one is missing
    while True:
        try:
            document = deserialize(str(idx) + ".dbf")
            doc_list.append(document)
            idx += 1
        except IOError:
            break
    total = len(doc_list)
    inverted_index.n = total
    for document in doc_list:
        lower_doc = str(document).lower()
        tokens = nltk.word_tokenize(lower_doc)
        for pos in range(len(tokens)):
            tk = tokens[pos]
            if tk not in inverted_index:
                inverted_index[tk] = list()
            term_data = inverted_index[tk]
            if document.id not in [p.doc_id for p in term_data]:
                term_data.append(Posting(document.id))
            for posting in term_data:
                if posting.doc_id == document.id:
                    posting.positions.append(pos)
                    break
        print("{0:.2f}% completed...".format(float(document.id) / total * 100))
    serialize(inverted_index, "inverted_index.idx")

class IndexTrainer(object):
    def __init__(self, centers):
        self.bow = Bow(centers)
        self.index = InvertedIndex()

    def load_feature(self, path='../models/feature.npy'):
        self.features = np.load(path)
        if len(self.features) > 500000:
            self.features = self.features[:500000]
        print("feature shape: ", self.features.shape)
        return self.features

    def train(self):
        self.bow.load('../models/bow.pkl')
        self.index.reset(self.bow.centers)
        self.index.append('img1', self.features[:100])
        self.index.append('img2', self.features[100:200])
        self.index.append('img3', self.features[200:300])
        print(self.index)

def __init__(self, n_terms):
    InvertedIndex.__init__(self, n_terms)

def __init__(self, centers):
    self.bow = Bow(centers)
    self.index = InvertedIndex()

'''
Created on 2014-11-27

@author: haoyu
'''
from inverted_index import InvertedIndex
from searcher import Searcher
from tfidf import TF_IDF
from bm25 import BM25

if __name__ == '__main__':
    fdir = "data"
    rankerUse = "TF-IDF"
    rankerAvailable = {"TF-IDF": TF_IDF, "BM25": BM25}

    invertedFile = InvertedIndex()
    invertedFile.makeTextIndexFromFloder(fdir)
    ranker = rankerAvailable[rankerUse](invertedFile)
    searcher = Searcher(invertedFile, ranker)
    # print(invertedFile.times)
    # print(invertedFile.contains)
    while True:
        searcher.search()

def __init__(self):
    self.index = InvertedIndex()
    self.bow = Bow()
    self.extractor = Extractor('surf')
    print(self.index.author)
    print(self.index.description)