def main(query, ranker):
    """Build the in-memory index from the crawled pages, then rank *query*.

    Parameters
    ----------
    query : str
        Raw search query string.
    ranker : str
        Ranking strategy: 'cosine' or 'PageRank'.

    Returns
    -------
    list | None
        Ranked document ids from the chosen ranker, or None when *ranker*
        is not one of the recognized values (matches original behavior).
    """
    from collections import Counter  # local import: only used here

    crawler_tuple = pk.open_pickle("crawler_tuple_pages.pkl")
    # Module-level state shared with the indexing/ranking helpers.
    global inverted
    global documents
    global max_freq
    global cosine
    global N
    global len_dict

    for url, page in crawler_tuple.items():
        N += 1
        tokens = page[1]  # token list for this page
        documents[url] = tokens
        inverted_index(url, tokens)
        # Highest term frequency in this document (tf normalization factor).
        # Counter is one O(n) pass; the old max(set(b), key=b.count) +
        # b.count(res) re-scanned the token list for every distinct term.
        if tokens:
            max_freq[url] = Counter(tokens).most_common(1)[0][1]
        else:
            max_freq[url] = 0  # empty page: avoid max() on empty sequence

    len_dict = {}
    for docno, file in documents.items():
        # Precompute per-document tf-idf vector lengths for the rankers.
        len_dict[docno] = tf_idf(docno, file)

    if ranker == 'cosine':
        return cosine_calc(query)
    elif ranker == 'PageRank':
        return page_rank_calc(query)


#print(len(main('computer','cosine')))
#print(len(main('computer','PageRank')))
def page_rank_calc(query):
    """Rank documents for *query* using query-dependent PageRank scores.

    Returns the document ids sorted from highest to lowest score.
    """
    terms = process_query(query)
    pagerank_table = pk.open_pickle('qdpr.pkl')
    doc_scores = score(pagerank_table, terms)
    # Keys ordered by descending score; Python's stable sort keeps ties in
    # their original insertion order, same as the items()-based version.
    return sorted(doc_scores, key=doc_scores.get, reverse=True)
Esempio n. 3
0
 def open_peaks(self, filename):
     """Load pickled peaks from *filename* and install them on this instance."""
     # Load and install in one step; open_pickle deserializes the peaks data.
     self.set_peaks(open_pickle(self.peaks, filename))
Esempio n. 4
0
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 27 13:50:24 2020

@author: Sheetal
Python program that calculates the tfidf for each term and computes the inlinks
"""
import math
import pickle_functions as pk

if __name__ == "__main__":

    word_count = pk.open_pickle('word_count.pkl')  # {url: {token: count}}
    vocab = pk.open_pickle('vocab.pkl')            # {token: document frequency}
    crawler_tuple = pk.open_pickle('crawler_tuple_pages.pkl')

    tfidf = {}
    N = len(word_count)  # N is the total number of webpages scraped
    for url, counts in word_count.items():
        # Hoisted out of the token loop: the page's highest raw term count is
        # invariant per page, but was recomputed for every single token before.
        max_count = max(counts.values())
        page_scores = {}
        for token, count in counts.items():
            # tf normalized by the page's most frequent term.
            tf = count / max_count
            # idf = log2(N / df), where vocab[token] is the document frequency.
            idf = math.log((N / vocab[token]), 2)
            page_scores[token] = tf * idf
        tfidf[url] = page_scores

    pk.save_pickle('tfidf.pkl', tfidf)
    inlink = {}
Esempio n. 5
0
                        # Collect candidate links from every anchor tag on the page.
                        a_tags = bs.find_all('a')
                        for a in a_tags:
                            try:
                                # Only follow links that point into the uic.edu domain.
                                if(re.search('.+?uic.edu',a["href"]) != None):
                                    # Skip non-HTML resources (extensions listed in skip_exten).
                                    if not any(ext in a["href"] for ext in skip_exten):
                                        parse = urlparse(a["href"])
                                        # Canonicalize: drop scheme, leading "www." and trailing "/".
                                        # NOTE(review): lstrip("www.") strips any run of the
                                        # characters w, ., not the literal prefix — e.g.
                                        # "web.uic.edu" loses its leading "we"; confirm intended.
                                        temp_href = ((parse.netloc+parse.path).lstrip("www.").rstrip("/"))
                                        # Enqueue only unvisited, not-yet-queued in-domain URLs.
                                        if(uic_domain in a["href"] and temp_href not in links_dict.values() and temp_href not in visited):
                                            url_queue.append(temp_href)
                                                
                            except:
                                # Anchors without an href (KeyError) are silently skipped.
                                continue
                        
                        print(page_num)
                        # Stop crawling once the page budget is exhausted.
                        if(page_num>search_limit):
                            break
                except:
                    # Best-effort crawl: report the failure and move to the next URL.
                    print("Connection failed for ", url)
                    continue
        # Persist crawl artifacts so later runs can reload instead of re-crawling.
        web_crawler = scrape(visited,vocab) 
        pk.save_pickle('crawler_tuple_pages.pkl',web_crawler)
        pk.save_pickle('word_count.pkl',word_count)
        pk.save_pickle('vocab.pkl',vocab)  
        pk.save_pickle('page_content.pkl',page_content)

    else:
        # Cached artifacts exist on disk: reload them instead of crawling.
        web_crawler = pk.open_pickle('crawler_tuple_pages.pkl')
        word_count = pk.open_pickle('word_count.pkl')
        vocab = pk.open_pickle('vocab.pkl')
        page_content = pk.open_pickle('page_content.pkl')
Esempio n. 6
0
    # Power-iteration loop: refine the query-dependent PageRank scores for a
    # fixed 10 rounds rather than testing for convergence.
    while (count < 10):
        for url in tfidf:
            for token in tfidf[url]:
                s = 0
                # Rank flowing in from every page i that links to `url`,
                # weighted by the transition probability pqitoj(token, i, url).
                for i in inlink[url]:
                    s += (qdpr_dict[i][token] if token in qdpr_dict[i] else
                          0) * pqitoj(token, i, url, tfidf)
                # Query-dependent teleport term: this page's share of the
                # token's total tf-idf mass across the collection.
                prQuery = tfidf[url][token] / sum(
                    tfidf[i][token] if token in tfidf[i] else 0 for i in tfidf)
                # df acts as the damping factor blending teleport and links
                # (presumably in [0, 1] — defined outside this view, confirm).
                qdpr_dict[url][token] = (1 - df) * prQuery + (df * s)
        count += 1
    return qdpr_dict


if __name__ == "__main__":

    crawler_tuple = pk.open_pickle("crawler_tuple_pages.pkl")
    tfidf = pk.open_pickle("tfidf.pkl")

    # Reuse the cached inlink map when present; building it is expensive.
    if os.path.exists('inlink.pkl'):
        inlink = pk.open_pickle("inlink.pkl")

    else:
        inlink = inlinkFunc(tfidf, crawler_tuple)
        pk.save_pickle("inlink.pkl", inlink)

    # Compute and persist the query-dependent PageRank table.
    qdpr_fin = qdpr(tfidf, crawler_tuple, inlink)
    print(qdpr_fin)
    pk.save_pickle("qdpr.pkl", qdpr_fin)
    # Immediately reads the pickle back — presumably a round-trip sanity
    # check; `qr` is otherwise unused in this view.
    qr = pk.open_pickle('qdpr.pkl')