Example #1
def main(query_file, document_list, stopwordfile):
    # Load the document list - one file per line
    with open(document_list, 'r') as f:
        print(f"Loading document list from {document_list}")
        doclist = f.read().split()
    # Construct the vocabulary
    vocabulary, document_tokens = build_vocabulary(doclist, stopwordfile)
    # Build the inverse index
    inverse_index = build_inverse_index(document_tokens, vocabulary)
    # Compute the inverse document frequencies
    idf = calc_idf(inverse_index, len(doclist))
    # Load the query
    with open(query_file, 'r') as f:
        query = f.read()
        # stop and stem the query
        query_text = stop_remover(query, stopwordfile)
        query_tokens = stemmer(query_text)
        # compute the word frequencies in the query
        query_wf = dict()
        for term in set(query_tokens):
            query_wf[term] = query_tokens.count(term)

        print(f"\nQuery word frequencies")
        print(query_wf)

    # Now we compute the similarity with each document
    print("\nComputing similarity of query with documents")
    for doc_number in range(len(doclist)):
        similarity = sim(query_wf, doc_number, inverse_index, idf)
        print(f"sim(q,d{doc_number}) = {similarity:.2f}")

    return None
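A note on the helpers: main() relies on build_inverse_index, calc_idf and sim, which are not shown in this listing (build_vocabulary appears below as Example #5). A minimal sketch of what they might look like, assuming the inverse index maps each term to a {doc_number: term_frequency} dictionary, is:

import math

def build_inverse_index(document_tokens, vocabulary):
    # Hypothetical structure: term -> {doc_number: frequency of term in that doc}
    index = {term: {} for term in vocabulary}
    for doc_number, tokens in enumerate(document_tokens):
        for term in set(tokens):
            index[term][doc_number] = tokens.count(term)
    return index

def calc_idf(inverse_index, num_docs):
    # idf(t) = log(N / df(t)), where df(t) is the number of documents containing t
    return {term: math.log(num_docs / max(1, len(postings)))
            for term, postings in inverse_index.items()}

def sim(query_wf, doc_number, inverse_index, idf):
    # Dot product of the query and document tf-idf vectors (no length normalisation)
    score = 0.0
    for term, q_freq in query_wf.items():
        d_freq = inverse_index.get(term, {}).get(doc_number, 0)
        score += (q_freq * idf.get(term, 0.0)) * (d_freq * idf.get(term, 0.0))
    return score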
Example #2
def searchPage(url, searchTags):
    priority = 0  # default so the return below never hits an unbound name
    try:
        # Tokenize and stem the page content, then compute term frequencies
        tokens = stemmer.stemmer(tokenizer.contentTokenizer(url))
        print("Calculating priority for: " + url)
        token_set = set(tokens)
        tf = {}
        queryTf = {}
        for word in token_set:
            tf[word] = float(tokens.count(word)) / len(tokens)
        #searchTags = stemmer.stemmer(tokenizer.stringTokenizer(string))
        # Keep only the frequencies of the query tags (0 if absent from the page)
        for tag in searchTags:
            queryTf[tag] = tf.get(tag, 0)
        priority = calcPriority(queryTf)
    except Exception:
        print("Priority calculation error")
    #print("Priority for " + url + " - " + str(priority))
    return priority
#print searchPage(url, "Big Bang theory cast sheldon")
#print searchPage(url, "Big armadillo hello lol this")
#print searchPage(url, "hello")

 
    
Example #3
def search(path_to_index, queries):
    import os
    from stemmer import stemmer
    from stopwords import stopwords
    from search_handler import search as search_handler

    index_path = os.path.join(path_to_index, 'inverted_index')
    # Instantiate the stemmer, the stopword list and the search handler
    searcher = search_handler(index_path, stemmer(), stopwords())
    outputs = []
    for query in queries:
        the_result = searcher.search(query)
        if the_result is None:
            outputs.append(['.'] * 10)  # no hits: ten placeholder results
        else:
            outputs.append(the_result)
    return outputs
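A hypothetical call (the index path and the queries below are only illustrative):

results = search('/path/to/index', ['big bang theory cast', 'sheldon cooper'])
for per_query in results:
    print(per_query)  # either the handler's result list or ten '.' placeholders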
Example #5
def build_vocabulary(doclist, stopwordfile):
    # Create an empty set for the vocabulary
    print("Building the vocabulary")
    vocabulary = set()
    document_tokens = []
    for doc in doclist:
        print(f"Loading document {doc}")
        with open(doc, 'r') as f:
            text = f.read()
            # Remove stop words
            text = stop_remover(text, stopwordfile)
            # stem
            tokens = stemmer(text)
            document_tokens.append(tokens)
            unique_tokens = set(tokens)
            print(f"{len(unique_tokens)} unique tokens")
            # Add the tokens to the vocab if they are not already there.
            vocabulary = vocabulary.union(unique_tokens)
            print(f"There are now {len(vocabulary)} words in the vocabulary\n")
    return vocabulary, document_tokens
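The stop_remover and stemmer functions used in Examples #1 and #5 do not appear in this listing. A minimal sketch, assuming the stopword file holds one word per line and borrowing NLTK's PorterStemmer (any stemmer would do), might be:

import re
from nltk.stem import PorterStemmer

def stop_remover(text, stopwordfile):
    # Drop every word that appears in the stopword file (one word per line)
    with open(stopwordfile, 'r') as f:
        stopwords = set(f.read().split())
    words = re.findall(r"[a-z0-9]+", text.lower())
    return ' '.join(w for w in words if w not in stopwords)

def stemmer(text):
    # Return the list of stemmed tokens
    porter = PorterStemmer()
    return [porter.stem(w) for w in text.split()]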
Example #6
def loginFormHandling():
    # Assumes a Flask app: request and render_template come from flask
    from flask import request, render_template
    from stemmer import stemmer
    from stopwords import stopwords
    from search_handler import search
    stemmer = stemmer()
    stopwords = stopwords()
    search = search("path", stemmer, stopwords)
    data = request.form
    query = request.form['query']
    flag = 0
    if data['title'] != '':
        flag = 1
        query += " title:"
        query += data['title']
    if data['infobox'] != '':
        flag = 1
        query += " infobox:"
        query += data['infobox']
    if data['references'] != '':
        flag = 1
        query += " ref:"
        query += data['references']
    if data['category'] != '':
        flag = 1
        query += " category:"
        query += data['category']
    if data['links'] != '':
        flag = 1
        query += " links:"
        query += data['links']
    if data['body'] != '':
        flag = 1
        query += " body:"
        query += data['body']
    # if (flag == 1):
    # 	r = len(request.form ['query'])
    # 	query = query[r:]
    print(query)
    the_result = search.search(query)
    return render_template('results.html', query=query, results=the_result)
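The six near-identical if blocks above can be collapsed into a single loop over the form fields (the unused flag is dropped). A sketch of that alternative, with field names assumed to match the form above:

FIELD_PREFIXES = [
    ('title', 'title'), ('infobox', 'infobox'), ('references', 'ref'),
    ('category', 'category'), ('links', 'links'), ('body', 'body'),
]

def build_fielded_query(data):
    # Start from the free-text query and append "prefix:value" for each filled field
    query = data['query']
    for field, prefix in FIELD_PREFIXES:
        if data.get(field, '') != '':
            query += " " + prefix + ":" + data[field]
    return query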
Example #7
def urlContains(url, searchTags):
    # Count how many of the (stemmed) search tags appear in the URL itself
    count = 0
    for tag in searchTags:
        if tag in url:
            count += 1
    return count


# Extensions to be avoided
excludedExtensions = {
    '.png', '.gif', '.jpg', '.jpeg', '.pdf', '.mp3', '.wmv', '.svg', '.ogg',
    '.jsp', '.ogv', '.py', '.tar.gz', '.css', '.ico', '.gz', '.ppt', '.zip',
    '.rar', '.ps', '.ppsx',
}  # A set literal gives the cheap membership test directly

errorcount = 0
query = sys.argv[1]  # Initial query for the focused crawler
urlcount = int(sys.argv[2])
# Stem and tokenize the search query once, up front, to avoid duplicate effort
searchTags = stemmer.stemmer(tokenizer.stringTokenizer(query))
print("Search tokens:")
print(searchTags)

top10urls = gQuery.googleSearch(query)

urls = MyPriorityQueue()  # Queue of URLs yet to be visited
results = {}  # Dictionary for storing result URLs
visited = set()

for url in top10urls:  # Seed the queue with the top 10 search results
    urls.put(url, 1)

br = mechanize.Browser()
f = open('results.txt', 'w+')
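MyPriorityQueue is a project-specific class that does not appear in this listing; only its put(item, priority) call is visible above. A minimal sketch with that interface, assuming smaller numbers mean higher priority and building on the standard heapq module, could be:

import heapq
import itertools

class MyPriorityQueue:
    # Small priority queue: put(item, priority), get() returns the best item
    def __init__(self):
        self._heap = []
        self._counter = itertools.count()  # tie-breaker for equal priorities

    def put(self, item, priority):
        heapq.heappush(self._heap, (priority, next(self._counter), item))

    def get(self):
        priority, _, item = heapq.heappop(self._heap)
        return item

    def empty(self):
        return not self._heap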
Example #8
import pickle, os, sys
import time
start = time.time()

from stemmer import stemmer
from stopwords import stopwords
from parser import parser

stemmer = stemmer()
stopwords = stopwords()

path_to_dump = sys.argv[1]
path_to_index_folder = sys.argv[2]

print(path_to_index_folder)

wikipedia_parser = parser(path_to_index_folder, stemmer, stopwords)

wikipedia_parser.parse(path_to_dump)
wikipedia_parser.make_alphabet_pairs()
# wikipedia_parser.merge_files()
# wikipedia_parser.divide_files()
end = time.time()
print("It took", end - start, "seconds")
Example #9
def urlContains(url, searchTags):
    # Count how many of the (stemmed) search tags appear in the URL itself
    count = 0
    for tag in searchTags:
        if tag in url:
            count += 1
    return count

# Extensions to be avoided
excludedExtensions = {'.png', '.gif', '.jpg', '.jpeg', '.pdf', '.mp3', '.wmv', '.svg', '.ogg', '.jsp', '.ogv', '.py', '.tar.gz', '.css', '.ico', '.gz', '.ppt', '.zip', '.rar', '.ps', '.ppsx'}    # A set literal gives the cheap membership test directly

errorcount = 0
query = sys.argv[1]  # Initial query for the focused crawler
urlcount = int(sys.argv[2])
# Stem and tokenize the search query once, up front, to avoid duplicate effort
searchTags = stemmer.stemmer(tokenizer.stringTokenizer(query))
print("Search tokens:")
print(searchTags)

top10urls = gQuery.googleSearch(query)

urls = MyPriorityQueue()    # Queue of URLs yet to be visited
results = {}    # Dictionary for storing result URLs
visited = set()

for url in top10urls:   # Seed the queue with the top 10 search results
    urls.put(url, 1)

br = mechanize.Browser()
f = open('results.txt', 'w+')