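# The class below relies on PyLucene and several project-local helper modules.
# The imports here are a best-guess sketch: the flat "from lucene import ..."
# form matches the PyLucene 3.x API used in the code (SimpleFSDirectory, File,
# IndexReader.open), and calculateNormalizer, createLexicon, LinkAnalysis,
# page_rank, page_rank_score, authorities_hubs, cluster and parse_query are
# assumed to come from project-local modules that are not shown here.
import time
# import lucene
# from lucene import SimpleFSDirectory, File, IndexReader
# lucene.initVM()  # PyLucene requires the JVM to be started before use

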
class search(object):
    """ This is the main class that is going to initiate the search engine """

    def __init__(
        self,
        filename,
        saver,
        loader,
        create_lexicon_flag=False,
        cluster_results=False,
        num_clusters=3,
        tf_idf_flag=True,
        ah_flag=False,
        pr_flag=False,
        normalize=True,
        create_page_rank_flag=False,
        directory="../index",
        linksFile="../index/IntLinks.txt",
        citationsFile="../index/IntCitations.txt",
        n_retrieves=10,
        root_set_size=10,
        maxIter=10,
        verbose=False,
    ):
        """ The init function loads up the pickled tf lexicon, normalizers and the norms of all the documents. 
		Also this is the function that setsup the entire class incuding all its flags  """

        self.normalize = normalize
        self.tf_idf_flag = tf_idf_flag
        self.ah_flag = ah_flag
        self.pr_flag = pr_flag
        self.n_retrieves = n_retrieves
        self.root_set_size = root_set_size
        self.maxIter = maxIter
        self.cluster_results = cluster_results
        self.num_clusters = num_clusters
        assert self.root_set_size >= self.n_retrieves

        directory = SimpleFSDirectory(File(directory))
        self.reader = IndexReader.open(directory)
        self.numDocs = self.reader.maxDoc()

        if self.normalize is False:
            self.norm = None

        # TF and/or TF-IDF part.
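        # The lexicon built or loaded below is assumed (from its use in retrieve())
        # to map each term string to a posting list of (doc_id, weight) pairs,
        # where weight is a tf or tf-idf value depending on tf_idf_flag.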
        if create_lexicon_flag is True:
            if normalize is True:  # set this second flag to True to also create a normalizer
                # (otherwise it is assumed to have been created from the Create Lexicon file).
                if verbose is True:
                    print "extracting all the norms of docs"
                    start_time = time.clock()
                self.norm = calculateNormalizer(reader=self.reader, verbose=verbose)
                if verbose is True:
                    end_time = time.clock()
                    print "time taken for calculating norms is : " + str(end_time - start_time) + " seconds"
                saver(filename=filename + "_norms", obj=self.norm)
            self.lexicon = createLexicon(
                filename=filename,
                reader=self.reader,
                norm=self.norm if self.normalize else None,
                tf_idf_flag=self.tf_idf_flag,
                verbose=verbose,
            )
            saver(filename=filename + "_lexicon", obj=self.lexicon)

        else:
            self.lexicon = loader(filename + "_lexicon")
            if normalize is True:
                if verbose is True:
                    print "loading norms"
                    start_time = time.clock()
                self.norm = loader(filename=filename + "_norms")
                if verbose is True:
                    end_time = time.clock()
                    print "time taken for loading norms is : " + str(end_time - start_time) + " seconds"

        # Authorities and Hubs part.
        if self.ah_flag is True or self.pr_flag is True:
            self.graph = LinkAnalysis(
                linksFile=linksFile, citationsFile=citationsFile, reader=self.reader, verbose=verbose
            )
            if pr_flag is True:
                # pre-calculate page rank
                if create_page_rank_flag is True:  # recompute PageRank from the link graph
                    self.pr_values = page_rank(
                        alpha=0.1,
                        maxIter=self.maxIter,
                        numDocs=self.numDocs,
                        graph=self.graph,
                        saver=saver,
                        verbose=True,
                    )
                else:
                    # if already calculated simply load
                    self.pr_values = loader("rank")

    def retrieve(self, pr_weight=0.4, verbose=True):
        """ This function retrieves
		Performs the steps in slide 31 lecture 3 
		"""
        ## ..
        start_time = time.clock()
        sim = [float(0)] * self.reader.maxDoc()

        # initialized all sims to 0
        if verbose is True:
            print "estimating query features"
        q_feat = self.query.tf_idf() if self.tf_idf_flag is True else self.query.tf()
        i = 0

        if verbose is True:
            print "calculating similarities"

        for term in self.query.query:
            if verbose is True:
                print "calculating similarities for term " + str(term.text())

            if str(term.text()) not in self.lexicon:
                print str(term.text()) + " is not in the index, so skipping it."
                i = i + 1  # keep the query feature index aligned with the term position
                continue

            I = self.lexicon[str(term.text())]  # posting list of the term: (doc_id, weight) pairs
            for doc_id, doc_feat in I:  # for every document that carries the term
                sim[int(doc_id)] = sim[int(doc_id)] + q_feat[i] * doc_feat
            i = i + 1

        if self.normalize is True:
            if verbose is True:
                print "normalizing"
            # divide each accumulated similarity by its document's norm
            for doc_id in xrange(len(sim)):
                if sim[doc_id] != 0:
                    sim[doc_id] = sim[doc_id] / self.norm[doc_id]

        if verbose is True:
            print "sorting"
        idx = sorted(range(len(sim)), key=lambda k: sim[k], reverse=True)
        end_time = time.clock()
        print "vector spcae time " + str(end_time - start_time) + " seconds"

        if self.ah_flag is True:
            # re rank idx by authorities and hubs and create a new idx
            start_time = time.clock()
            if verbose is True:
                print "estimating root set"
            assert self.pr_flag is False
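            # HITS-style expansion: take the top root_set_size documents of the
            # vector-space ranking as the root set, then grow the base set with
            # every page a root page links to and every page that links to a root
            # page, recording forward edges in link_adj and backward edges in
            # citation_adj for the iterative authority/hub updates.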

            link_adj = {}
            citation_adj = {}

            root_set = list(idx[0 : self.root_set_size])
            base_set = list(root_set)

            if verbose is True:
                print "growing base set"
                count = 0

            for root_node in root_set:
                if verbose is True:
                    count = count + 1
                    print "growing base for node " + str(count) + " - " + str(root_node)

                if root_node not in link_adj:
                    link_adj[root_node] = []
                if root_node not in citation_adj:
                    citation_adj[root_node] = []

                fwd_links = list(self.graph.getLinks(root_node))
                if verbose is True:
                    print "number of forward links for " + str(root_node) + " is " + str(len(fwd_links))

                for fwd_link in fwd_links:

                    if verbose is True:
                        print "processing link " + str(fwd_link)

                    if fwd_link not in base_set:
                        base_set.append(fwd_link)
                        # grow base set and mark in adjacency matrix
                    if fwd_link not in citation_adj:
                        citation_adj[fwd_link] = []
                    if fwd_link not in link_adj:
                        link_adj[fwd_link] = []

                    # citation_adj[fwd_link].append(root_node)
                    link_adj[root_node].append(fwd_link)

                back_links = list(self.graph.getCitations(root_node))
                if verbose is True:
                    print "number of backward links for " + str(root_node) + " is " + str(len(back_links))

                for back_link in back_links:

                    if verbose is True:
                        print "processing link " + str(back_link)

                    if back_link not in base_set:
                        base_set.append(back_link)

                    if back_link not in link_adj:
                        link_adj[back_link] = []
                    if back_link not in citation_adj:
                        citation_adj[back_link] = []

                    # link_adj[back_link].append(root_node)
                    citation_adj[root_node].append(back_link)

            if verbose is True:
                print "size of base set is " + str(len(base_set))
                print "size of citation adjacency is " + str(len(citation_adj.keys()))
                print "size of link adjacency is " + str(len(link_adj.keys()))
            # calculate hubs and authorities
            auth_score, hub_score = authorities_hubs(
                numDocs=self.numDocs, adj=(link_adj, citation_adj), nodes=base_set, maxIter=self.maxIter, verbose=False
            )

            auth_idx = sorted(range(len(auth_score)), key=lambda k: auth_score[k], reverse=True)
            hub_idx = sorted(range(len(hub_score)), key=lambda k: hub_score[k], reverse=True)

            end_time = time.clock()
            print "authorities and hubs took " + str(end_time - start_time) + " seconds"

        elif self.pr_flag is True:
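            # Blend the vector-space similarities with the pre-computed PageRank
            # values; pr_weight controls the contribution of PageRank, and the exact
            # combination is implemented in page_rank_score.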
            start_time = time.clock()

            # calculate the page rank score
            sim_new = page_rank_score(weight=pr_weight, similarities=sim, pr_val=self.pr_values, verbose=verbose)

            idx_new = sorted(range(len(sim_new)), key=lambda k: sim_new[k], reverse=True)
            end_time = time.clock()
            print "page rank took " + str(end_time - start_time) + " seconds"

        if self.cluster_results is True:
            words = cluster(
                doc_ids=idx[0 : self.n_retrieves],
                lexicon=self.lexicon,
                r=self.reader,
                num_clusters=self.num_clusters,
                verbose=verbose,
            )

        if self.ah_flag is True:
            return (idx, sim, auth_idx, auth_score, hub_idx, hub_score)

        elif self.pr_flag is True:
            return (idx, sim, idx_new, sim_new)

        else:
            return (idx, sim)

    def run(self, query, print_urls=True, pr_weight=0.4, verbose=False):
        """this function basically runs a query"""
        self.query = parse_query(query, self.reader)
        start_time = time.clock()

        if self.ah_flag is True:
            doc_ids, score, auth_ids, auth_score, hub_ids, hub_score = self.retrieve(verbose=verbose)
        elif self.pr_flag is True:
            doc_ids, score, pr_ids, pr = self.retrieve(pr_weight=pr_weight, verbose=verbose)
        else:
            doc_ids, score = self.retrieve(verbose=verbose)

        end_time = time.clock()
        print "in total  " + str(end_time - start_time) + " seconds for retrieval"
        if print_urls is True:
            print "vector space retreival"
            for i in xrange(self.n_retrieves):
                d = self.reader.document(doc_ids[i])
                print "doc: [" + str(doc_ids[i]) + "], score: [" + str(score[doc_ids[i]]) + "]"
                # , url: " + d.getFieldable("path").stringValue().replace("%%", "/")

            if self.ah_flag is True:
                print "authorities based retreival"
                for i in xrange(self.n_retrieves):
                    d = self.reader.document(auth_ids[i])
                    print "doc: [" + str(auth_ids[i]) + "], score: [" + str(auth_score[auth_ids[i]]) + "]"
                    # , url: " + d.getFieldable("path").stringValue().replace("%%", "/")

                print "hubs based retreival"
                for i in xrange(self.n_retrieves):
                    d = self.reader.document(hub_ids[i])
                    print "doc: [" + str(hub_ids[i]) + "], score: [" + str(hub_score[hub_ids[i]]) + "]"
                    # , url: " + d.getFieldable("path").stringValue().replace("%%", "/")

            elif self.pr_flag is True:
                print "page rank based retreival"
                for i in xrange(self.n_retrieves):
                    d = self.reader.document(pr_ids[i])
                    print "doc: [" + str(pr_ids[i]) + "], score: [" + str(pr[pr_ids[i]]) + "]"
                    # , url: " + d.getFieldable("path").stringValue().replace("%%", "/")

        print "retrieval complete. "
        print "..........................................................................."
        return d
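

# A minimal usage sketch, not part of the original project: the pickle-based
# saver/loader helpers, the index/lexicon paths and the query string below are
# all assumptions made for illustration. It also assumes the PyLucene JVM has
# been initialized as sketched at the top of the file.
if __name__ == "__main__":
    import cPickle

    def saver(filename, obj):
        # persist an object (lexicon, norms, ...) under the given base filename
        with open(filename + ".pkl", "wb") as f:
            cPickle.dump(obj, f)

    def loader(filename):
        # load a previously pickled object from the given base filename
        with open(filename + ".pkl", "rb") as f:
            return cPickle.load(f)

    engine = search(filename="../index/lexicon",   # assumed base path for the pickles
                    saver=saver,
                    loader=loader,
                    create_lexicon_flag=False,      # load an already pickled lexicon
                    pr_flag=True,                   # re-rank with pre-computed PageRank
                    verbose=True)
    engine.run("information retrieval", print_urls=True, pr_weight=0.4)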