def main(create_index=False):
    import time
    import os
    from DocCollection import DocCollection
    from InvertedIndex import InvertedIndex

    db = DocCollection()
    index = InvertedIndex(db)
    if create_index:
        index_corpus(index)
    else:
        try:
            start = time.time()
            index.load_index_from_file('./indexer/inverted_index.txt')
            end = time.time()
            print('\033[1;36;40m Load Index time: \033[0;0m', end - start)
        except FileNotFoundError:
            print('Must create and save Inverted Index first!')
            user_input = input('Do you want to create Inverted Index now? y/n\n')
            if user_input.lower() == 'y':
                print('Indexing Documents . . .')
                index_corpus(index)
            else:
                exit(0)
    while True:
        query_collection(index, db, get_docs=False)

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


def searchengine(directory):
    stopWords = set(stopwords.words("english"))
    # stemming
    ps = PorterStemmer()
    # create InvertedIndex obj
    invertedIndex = InvertedIndex()
    # build the corpus
    Corp = Corpus()
    corpus = Corp.buildCorpus(directory)
    for docId in corpus:
        doc = corpus[docId]
        content = doc.getContent()
        # tokenize
        tokens = word_tokenize(content)
        for token in tokens:
            token = token.lower()
            # apply stemming
            token = ps.stem(token)
            # remove stopwords
            if token in stopWords:
                continue
            # add to index
            invertedIndex.addTerm(token, docId)
    return invertedIndex, corpus

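# A usage sketch for searchengine() above; the "./documents" directory is
# illustrative, and the nltk resources the function relies on (punkt, stopwords)
# must be downloaded first.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

invertedIndex, corpus = searchengine("./documents")
print(len(corpus))  # number of documents that were tokenized, stemmed and indexed
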
def process(url, soup):
    # Helper function from
    # http://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text
    def visible(element):
        if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
            return False
        elif re.match('<!--.*-->', str(element)):
            return False
        return True

    visible_texts = filter(visible, soup.findAll(text=True))
    text = ' '.join(visible_texts)
    e = models.Entity(
        # Hack to retrieve the proper name from Wikipedia pages
        name=soup.title.string.split(" - Wikipedia")[0],
        url=url,
        image_url=None,
        text=text,
        classification=None)
    # Write objects to the database. Committing each object by itself
    # is inefficient, but acceptable here.
    db.session.add(e)
    db.session.commit()
    # process inverted index
    InvertedIndex.build_inverted_index(text, url)

def main(args):
    print("Initializing...")
    documents = DocumentIndex(args.file)
    index = InvertedIndex()
    index.initialize(documents.index)
    query_manager = QueryManager(index)
    cli = CLI(documents, query_manager)
    print("Done")
    cli.initialize()

class BooleanRetrieval():

    def __init__(self):
        self.ii = InvertedIndex()
        self.pi = PositionalIndex()

    def userInterface(self):
        op = '1'
        while op != '0':
            print("Boolean Information Retrieval")
            print("-----------------------------")
            print("1. Positional Index")
            print("2. Inverted Index")
            print("0. Exit")
            op = input("Enter input: ")
            self.inputQuery(op)

    def inputQuery(self, op):
        if op == '1':
            query = input("Enter query: ")
            self.pi.loadDocuments()
            self.pi.buildDictionary()
            qb = QueryBuilder(query, positionalIndex=self.pi)
            output = qb.executeQuery()
            print(output)
            if len(qb.transformQuery()) == 1:
                self.printOutput(query)
            print("\n")
        elif op == '2':
            query = input("Enter boolean query: ")
            self.ii.loadDocuments()
            self.ii.buildDictionary()
            qb = QueryBuilder(query, invertedIndex=self.ii)
            output = qb.executeQuery()
            print(output)
            print("\n")
        else:
            return

    def printOutput(self, query):
        print("query: ", query)
        dictionary = self.pi.getPositionalIndex(query)
        i = 0
        for k, v in dictionary.items():
            print("doc " + str(k) + ":", v)
            i = i + 1
        print("\nDOCUMENTS RETURNED:", i)

def main():
    # All crawling here!
    crawlSpider()
    invIndex = InvertedIndex()
    invIndex.loadPickles()
    queryObj = Query()
    os.system("clear")
    # Infinite loop
    while (1):
        print ""
        print ""
        print ""
        # invIndex.createTermFrequencyMatrix()
        queryObj.query = raw_input("Please enter a query for zackSpider: ")
        print "Your query is:", queryObj.query
        returnDocs = queryObj.parseQuery(queryObj.query, invIndex.inverted_index)
        if returnDocs:
            returnedDocs = sorted(returnDocs.items(), key=itemgetter(1), reverse=True)
            os.system("clear")
            print ""
            print ""
            print ""
            print "The following documents are ranked from highest to lowest similarity for your query: "
            print "---------------------------------------------------------------------------------------"
            print "{:<5} {:<15} {:<55} {:<10}".format('Doc', 'Similarity', 'Url', 'Preview')
            for key in returnedDocs:
                docKey = key[0] - 1
                doc = invIndex.collections_index[docKey]
                sim = key[1]
                print "{:<5} {:<15.10f} {:<55} {:<10}".format(docKey, sim, doc[0], doc[1])
            print ""
            print ""
        else:
            print "No results."
            print ""
            print ""

class DBuildInvertedIndex(object):

    def __init__(self):
        self.table_name = "InvertedIndex"
        self.db = DBProcess('root', '19920102', 'Web2', 'InvertedIndex')  # connect to the database
        self.inverted_ob = InvertedIndex()
        self.inverted_index = self.inverted_ob.create_inverted_index()  # build the inverted index

    def write_index_to_db(self):
        count = 1
        for item in self.inverted_index:
            url_set = "\n".join(self.inverted_index[item][0:])
            print("INSERT INTO " + self.table_name + " VALUE(NULL, \"" + item + "\",\"" + url_set + "\")")
            try:
                self.db.insert_data("INSERT INTO " + self.table_name +
                                    " VALUE(NULL ,\"" + item + "\",\"" + url_set + "\")")
                print("insert success! %d" % count)
                count += 1
            except pymysql.err.ProgrammingError as e:
                print("INSERT INTO " + self.table_name + " VALUE(NULL, \"" + item + "\",\"" + url_set + "\")")
                print(e)
        self.db.close_db()

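# A minimal sketch of the same write path using parameterized queries instead of
# string concatenation, which avoids quoting/escaping errors and SQL injection.
# It assumes a plain pymysql connection rather than the DBProcess wrapper above;
# the credentials and the three-column table layout (auto-increment id, term,
# newline-joined URL list) are taken from the class, everything else is illustrative.
import pymysql


def write_index_to_db_parameterized(inverted_index, table_name="InvertedIndex"):
    conn = pymysql.connect(host="localhost", user="root", password="19920102",
                           database="Web2", charset="utf8mb4")
    try:
        with conn.cursor() as cursor:
            sql = "INSERT INTO " + table_name + " VALUES (NULL, %s, %s)"
            for term, urls in inverted_index.items():
                # one row per term: the term plus its newline-joined URL list
                cursor.execute(sql, (term, "\n".join(urls)))
        conn.commit()
    finally:
        conn.close()
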
def result():
    if request.method == "POST":
        word = request.form.get("Search")
        res, count_assignment, count_comparator = InvertedIndex(word)
        return render_template("result.html",
                               result=res,
                               count_ass=count_assignment,
                               count_com=count_comparator)

def get_results():
    query = request.args.get('query')
    urls = InvertedIndex.get_rank(query)
    # urls = (('google.com', 3), ('facebook.com', 1))
    res = []
    for (url, rank) in urls:
        res.append({'url': url, 'rank': rank})
    return jsonify({'results': res, 'time_ms': 30})

def main(docs_path: str, query_path: str):
    docs_files = os.scandir(docs_path)
    inv_idx = InvertedIndex()
    print(f"Indexing {docs_path}...")
    for file in docs_files:
        try:
            inv_idx.parse_file(file.path)
        except Exception as e:
            print(f"Error at {file.name}:\n\t{e}")
            raise e
    inv_idx.finalize()
    print("Inverted Index is built.")

    print("Fetching queries...")
    with open(query_path, 'r') as f:
        with open("Task_2.txt", 'w') as o:
            o.write('\n'.join([BooleanRetrieval(inv_idx, q) for q in f]))
    print("Done querying.")

    print("Collection statistics...")
    with open("Task_3.txt", 'w') as f:
        terms = list(inv_idx.index.keys())
        f.write(f"Terms with highest document frequency:\n{terms[:10]}\n")
        f.write(f"Terms with lowest document frequency:\n{terms[-10:]}")
    print("Done.")

class SearchEngine(object):
    inverted_index = InvertedIndex()

    def buildTries(self):
        f = open(MAP_PATH, "r")
        file_link_map = json.loads(f.readlines()[0])
        # filter stopwords and punctuation
        stop_words = set(stopwords.words('english'))
        stop_words.update(string.punctuation)
        for filename in file_link_map:
            f = open(filename)
            soup = BeautifulSoup(f.read(), 'html.parser')
            [script.extract() for script in soup.findAll('script')]
            [style.extract() for style in soup.findAll('style')]
            # print file_link_map[filename]
            words = word_tokenize(soup.get_text())
            # remove the words containing punctuation
            words = [i for i in words if all(j not in string.punctuation for j in i)]
            for word in words:
                if word.lower() not in stop_words and len(word) > MINIMUM_CHR and word.isdigit() == False:
                    # build compressed trie tree
                    try:
                        # skip the words which can't be encoded to ASCII
                        word = word.lower().strip().encode('ascii')
                    except:
                        # print word
                        pass
                    else:
                        self.inverted_index.put(word, file_link_map[filename])
                        # print word.lower().strip()
            f.close()

    def search(self, key):
        if self.inverted_index.get(key) == None:
            return {}
        else:
            return self.inverted_index.get(key)

    def getRecomendKey(self, string):
        return self.inverted_index.getRecommendKey(string)

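# A usage sketch for SearchEngine above (not from the original source): it assumes
# MAP_PATH points to a one-line JSON file mapping local HTML filenames to URLs and
# that the nltk stopwords corpus is available. The query strings are illustrative.
engine = SearchEngine()
engine.buildTries()                    # index every HTML file listed in MAP_PATH
print(engine.search("retrieval"))      # links for pages containing "retrieval"
print(engine.getRecomendKey("retr"))   # prefix suggestions from the compressed trie
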
def main():
    parser = argparse.ArgumentParser(
        description='An example script to check the index')
    parser.add_argument("-t", type=str, required=True, help="The term to extract")
    args = parser.parse_args()
    index = InvertedIndex()
    documents = read_file("cran-1400.txt")
    print("Initializing index...")
    index.initialize(documents)
    print("Let's check the number of indexed terms:")
    print(str(len(index.terms.keys())))
    print("Checking how many documents contain the word " + args.t)
    try:
        print(len(index.get_post_list(args.t)))
        print("The exact documents are:")
        for doc_id in index.get_post_list(args.t).keys():
            print(doc_id)
    except:
        print("No entry for that term")
        exit(1)

def load_inverted_index_in_memory(self, collection_stats_file, docs_meta_file,
                                  lookup_table_file, inverted_lists_file, compressed):
    """
    Loads an inverted index into memory; inverted lists are not loaded by default
    buffer collection_stats_file: Buffer for the collection stats file
    buffer docs_meta_file: Buffer for the docs meta file
    buffer lookup_table_file: Buffer for the lookup table file
    buffer inverted_lists_file: Buffer for the inverted lists file
    bool compressed: Flag to choose between a compressed / uncompressed index
    """
    inverted_index = InvertedIndex(self.config, compressed)

    # Load collection statistics
    collection_stats = json.load(collection_stats_file)
    inverted_index.load_collection_stats(collection_stats)

    # Load meta info for documents
    docs_meta = json.load(docs_meta_file)
    inverted_index.load_docs_meta(docs_meta)

    # Load lookup table
    lookup_table = json.load(lookup_table_file)
    inverted_index.load_lookup_table(lookup_table)

    # Load vocabulary
    inverted_index.load_vocabulary()

    # Load inverted lists only if in_memory is True
    if self.config.in_memory:
        index_map = defaultdict(InvertedList)
        for term, term_stats in lookup_table.items():
            inverted_list = index_map[term]
            inverted_list_binary = inverted_index.read_inverted_list_from_file(
                inverted_lists_file,
                term_stats['posting_list_position'],
                term_stats['posting_list_size'])
            inverted_list.bytearray_to_postings(
                inverted_list_binary, compressed, term_stats['df'])
        inverted_index.load_map(index_map)

    return inverted_index

# -*- coding:utf-8 -*-
__author__ = 'Kusamura'

from Data import Data
from InvertedIndex import InvertedIndex
from Search import Search
from Frequency import Frequency

if __name__ == '__main__':
    fileList = []  # list of chapters
    for line in open('chaps/chap_title.tsv', 'r'):  # build the chapter list from chap_title.tsv
        fileList.append(line[:-1].split('\t')[0][2:])

    dataList = []  # list of Data objects (data read from each file)
    for fileName in fileList:
        dataList.append(Data(fileName))

    index = InvertedIndex(dataList)  # build the inverted index

    # module = Search(index)  # search module
    # keys = ['retrieval', 'half-a-trillion', 'thus', 'layer', 'test', 'hoge', 'hogehoge']
    # for key in keys:
    #     module.do(key)

    # print index.countKeys()  # count the number of terms in the dictionary
    # print Frequency().docFrequency(index)  # document frequency
    print Frequency().termFrequency(dataList)  # term frequency

def create_inverted_index(self, compressed):
    """
    Creates and returns an inverted index
    bool compressed: Flag to choose between a compressed / uncompressed index
    """
    inverted_index = InvertedIndex(self.config, compressed)
    data = self.load_data()
    doc_id = -1
    for scene in data['corpus']:
        doc_id += 1
        scene_text = scene['text']
        # Filter None removes empty strings from the list after the split on space
        terms = list(filter(None, scene_text.split()))
        doc_meta = {
            'playId': scene['playId'],
            'sceneId': scene['sceneId'],
            'sceneNum': scene['sceneNum'],
            'sceneLength': len(terms)
        }
        inverted_index.update_docs_meta(doc_id, doc_meta)
        inverted_index.update_collection_stats(doc_length=doc_meta['sceneLength'])
        for position, term in enumerate(terms):
            inverted_index.update_map(term, doc_id, position)
    inverted_index.update_collection_stats(average_length=True)
    inverted_index.load_vocabulary()
    return inverted_index

from bs4 import BeautifulSoup
import requests
from Trie import CompressedTrie
from InvertedIndex import InvertedIndex
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import unicodedata

global cache
global inv_idx
global unused_urls
unused_urls = []
cache = CompressedTrie()
inv_idx = InvertedIndex()


def page_load(url):
    """
    Loads all data from webpage given the url
    Parameters:
        string url - webpage url
    Return:
        string data - webpage HTML data
    """
    r = requests.get(url)
    data = r.content
    return data


def strip_accents(text):
    """
    Replace accented letters in word with equivalent for processing in trie
    Parameters:
        string text - text to be processed
import os
import sys
import time

from bs4 import BeautifulSoup
from InvertedIndex import InvertedIndex

if __name__ == "__main__":
    """
    Default values of folder path and index filename
    """
    path = "/home/stark/Projects/vector-space-retrieval-system/data/sample/"
    index_filename = "indexfile"
    if len(sys.argv) == 3:
        path = sys.argv[1]
        index_filename = sys.argv[2]

    start_time = time.time()
    idx = InvertedIndex()
    for filename in os.listdir(path):
        print(filename)
        file = open(path + filename)
        content = "".join(file.readlines())
        file.close()
        bs_content = BeautifulSoup(content, "html.parser")
        for doc in bs_content.find_all("doc"):
            idx.add_doc(doc)
    idx.assign_postings_list()
    idx.save()
    end_time = time.time()
    print("Total time: {}".format(end_time - start_time))
    del idx

def __init__(self, queryString=""): print "Constructing Query Object!" self.invIndex = InvertedIndex() self.tfidf = TFIDF() self.query = queryString
                doc_id[0],
                rank,
                doc_id[1],
                "STANDARD",
            ))
            rank += 1
    file.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cutoff", type=int, default=10)
    parser.add_argument("--query", type=str, default="data/topics.51-100")
    parser.add_argument("--index", type=str, default="./indexfile.idx")
    parser.add_argument("--dict", type=str, default="./indexfile.dict")
    parser.add_argument("--output", type=str, default="resultfile")
    args = parser.parse_args()

    invidx = InvertedIndex(dictionary_filepath=args.dict,
                           postings_list_filepath=args.index)
    queryfile = args.query
    queryList, queryIdMap = get_query_list(queryfile)
    ranklist = {}
    for query in queryList:
        ranklist[query] = invidx.get_ranking_list(query)[:args.cutoff]
    write_resultfile(args.output, ranklist, queryIdMap)
    del invidx
    del ranklist
