def main(query_file, document_list, stopwordfile):
    # Load the document list - one file per line
    with open(document_list, 'r') as f:
        print(f"Loading document list from {document_list}")
        doclist = f.read().split()

    # Construct the vocabulary
    vocabulary, document_tokens = build_vocabulary(doclist, stopwordfile)

    # Build the inverse index
    inverse_index = build_inverse_index(document_tokens, vocabulary)

    # Compute the inverse document frequencies
    idf = calc_idf(inverse_index, len(doclist))

    # Load the query
    with open(query_file, 'r') as f:
        query = f.read()

    # Stop and stem the query
    query_text = stop_remover(query, stopwordfile)
    query_tokens = stemmer(query_text)

    # Compute the word frequencies in the query
    query_wf = dict()
    for term in set(query_tokens):
        query_wf[term] = query_tokens.count(term)
    print("\nQuery word frequencies")
    print(query_wf)

    # Now we compute the similarity with each document
    print("\nComputing similarity of query with documents")
    for doc_number in range(len(doclist)):
        similarity = sim(query_wf, doc_number, inverse_index, idf)
        print(f"sim(q,d{doc_number}) = {similarity:.2f}")

    return None
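# calc_idf and sim are defined elsewhere in this project. The following is a
# minimal sketch of plausible implementations, assuming the inverse index maps
# each term to a dict of {doc_number: term_frequency}, that idf uses the
# standard log(N / document frequency) weighting, and that sim is a TF-IDF
# weighted dot product between query and document. None of this is taken from
# the original source; the signatures are inferred from the calls in main().
import math

def calc_idf(inverse_index, num_docs):
    # idf(term) = log(N / df(term)), where df(term) is the number of
    # documents containing the term at least once.
    return {term: math.log(num_docs / len(postings))
            for term, postings in inverse_index.items()}

def sim(query_wf, doc_number, inverse_index, idf):
    # Dot product of TF-IDF weights over the terms the query shares with the
    # document; documents that never contain a query term contribute nothing.
    score = 0.0
    for term, q_freq in query_wf.items():
        d_freq = inverse_index.get(term, {}).get(doc_number, 0)
        score += (q_freq * idf.get(term, 0.0)) * (d_freq * idf.get(term, 0.0))
    return score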
def searchPage(url, searchTags):
    # stemmer, tokenizer and calcPriority are project modules/functions
    # imported elsewhere in this crawler.
    try:
        tokens = stemmer.stemmer(tokenizer.contentTokenizer(url))
        print "Calculating priority for : " + url
        token_set = set(tokens)
        tf = {}
        queryTf = {}
        # Term frequency of every distinct word on the page
        for word in token_set:
            tf[word] = float(tokens.count(word)) / len(tokens)
        # searchTags = stemmer.stemmer(tokenizer.stringTokenizer(string))
        # Term frequency of each query tag on this page (0 if absent)
        for tag in searchTags:
            queryTf[tag] = tf.get(tag, 0)
        priority = calcPriority(queryTf)
    except Exception:
        print "Priority calculation error"
        priority = 0  # Fall back so the caller always gets a value
    # print "Priority for " + url + " - " + priority
    return priority

# print queryCount
# print searchPage(url, "Big Bang theory cast sheldon")
# print searchPage(url, "Big armadillo hello lol this")
# print searchPage(url, "hello")
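# calcPriority is defined elsewhere in the crawler. A minimal sketch of what
# it might do, assuming the priority is simply the total term frequency of the
# query tags on the page (the original weighting may differ):
def calcPriority(queryTf):
    # More occurrences of the search tags -> higher crawl priority.
    return sum(queryTf.values())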
def search(path_to_index, queries):
    a = os.path.join(path_to_index, 'inverted_index')

    from stemmer import stemmer
    from stopwords import stopwords
    from search_handler import search

    stemmer = stemmer()
    stopwords = stopwords()
    search = search(a, stemmer, stopwords)

    outputs = []
    for query in queries:
        the_result = search.search(query)
        if the_result is None:
            outputs.append(['.'] * 10)
        else:
            outputs.append(the_result)
    return outputs
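# Example of how the batch entry point above might be driven, assuming the
# queries arrive one per line in a text file (file and directory names here
# are purely illustrative):
if __name__ == '__main__':
    with open('queries.txt') as f:
        queries = [line.strip() for line in f if line.strip()]
    for query, result in zip(queries, search('index_dir', queries)):
        print(query, result)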
def build_vocabulary(doclist, stopwordfile):
    # Create an empty set for the vocabulary
    print("Building the vocabulary")
    vocabulary = set()
    document_tokens = []
    for doc in doclist:
        print(f"Loading document {doc}")
        with open(doc, 'r') as f:
            text = f.read()
        # Remove stop words
        text = stop_remover(text, stopwordfile)
        # Stem
        tokens = stemmer(text)
        document_tokens.append(tokens)
        unique_tokens = set(tokens)
        print(f"{len(unique_tokens)} unique tokens")
        # Add the tokens to the vocab if they are not already there.
        vocabulary = vocabulary.union(unique_tokens)
        print(f"There are now {len(vocabulary)} words in the vocabulary\n")
    return vocabulary, document_tokens
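# stop_remover and stemmer are helpers shared by these scripts. A minimal
# sketch of plausible implementations, assuming the stopword file holds one
# word per line and borrowing NLTK's PorterStemmer for the stemming step
# (the original project may well implement its own stemmer):
from nltk.stem import PorterStemmer

def stop_remover(text, stopwordfile):
    # Drop every token that appears in the stopword list.
    with open(stopwordfile, 'r') as f:
        stopwords = set(f.read().split())
    return ' '.join(w for w in text.lower().split() if w not in stopwords)

def stemmer(text):
    # Reduce each remaining token to its stem and return the token list.
    porter = PorterStemmer()
    return [porter.stem(w) for w in text.split()]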
from flask import request, render_template  # Flask objects used by this view

def loginFormHandling():
    from stemmer import stemmer
    from stopwords import stopwords
    from search_handler import search

    stemmer = stemmer()
    stopwords = stopwords()
    search = search("path", stemmer, stopwords)

    data = request.form
    query = request.form['query']

    # Append a field-restricted clause for every non-empty form field.
    flag = 0
    field_prefixes = [('title', 'title'), ('infobox', 'infobox'),
                      ('references', 'ref'), ('category', 'category'),
                      ('links', 'links'), ('body', 'body')]
    for field, prefix in field_prefixes:
        if data[field] != '':
            flag = 1
            query += " " + prefix + ":" + data[field]

    # if (flag == 1):
    #     r = len(request.form['query'])
    #     query = query[r:]
    print(query)
    the_result = search.search(query)
    return render_template('results.html', query=query, results=the_result)
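# In the full Flask app this view would be registered on a route. A sketch of
# that wiring, assuming an app object and a URL rule that are not shown in
# this snippet:
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/search', 'loginFormHandling', loginFormHandling,
                 methods=['GET', 'POST'])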
import pickle, os, sys
import time

start = time.time()

from stemmer import stemmer
from stopwords import stopwords
from parser import parser

stemmer = stemmer()
stopwords = stopwords()

path_to_dump = sys.argv[1]
path_to_index_folder = sys.argv[2]
print(path_to_index_folder)

wikipedia_parser = parser(path_to_index_folder, stemmer, stopwords)
wikipedia_parser.parse(path_to_dump)
wikipedia_parser.make_alphabet_pairs()
# wikipedia_parser.merge_files()
# wikipedia_parser.divide_files()

end = time.time()
print("It took", end - start, "seconds")
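# A typical invocation of the indexing script above, assuming it is saved as
# index.py (the script name and paths are illustrative):
#
#     python index.py path/to/enwiki-dump.xml path/to/index_folder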
def urlContains(url, searchTags):
    # Count how many of the query terms appear verbatim in the URL itself.
    count = 0
    for tag in searchTags:
        if tag in url:
            count += 1
    return count

# The following script section assumes sys, mechanize and the project modules
# stemmer, tokenizer and gQuery are imported at the top of this file.

# Extensions to be avoided
excludedExtentions = ['.png', '.gif', '.jpg', '.jpeg', '.pdf', '.mp3', '.wmv',
                      '.svg', '.ogg', '.jsp', '.ogv', '.py', '.tar.gz', '.css',
                      '.ico', '.gz', '.ppt', '.zip', '.rar', '.ps', '.ppsx']
excludedExtensions = set(excludedExtentions)  # Making a set for easy membership tests

errorcount = 0
query = sys.argv[1]          # Initial query for the focused crawler
urlcount = int(sys.argv[2])

# Stemming and tokenizing the search query once, to avoid duplicate effort
searchTags = stemmer.stemmer(tokenizer.stringTokenizer(query))
print "Search tokens : "
print searchTags

top10urls = gQuery.googleSearch(query)

urls = MyPriorityQueue()   # Queue for storing URLs yet to be visited
results = {}               # Dictionary structure for storing result URLs
visited = set()

for i in range(len(top10urls)):   # Adding the top 10 results to the queue
    urls.put(top10urls[i], 1)

br = mechanize.Browser()
f = open('results.txt', 'w+')
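# MyPriorityQueue is defined elsewhere in the crawler. A minimal sketch of a
# compatible max-priority queue built on the standard library's heapq (the
# original class may differ; only the put(item, priority) call is taken from
# the code above):
import heapq

class MyPriorityQueue(object):
    def __init__(self):
        self._heap = []
        self._counter = 0

    def put(self, item, priority):
        # heapq pops the smallest entry first, so negate the priority to get
        # max-priority behaviour; the counter breaks ties in insertion order.
        heapq.heappush(self._heap, (-priority, self._counter, item))
        self._counter += 1

    def get(self):
        return heapq.heappop(self._heap)[2]

    def empty(self):
        return len(self._heap) == 0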