Ejemplo n.º 1
0
def show_layers_effect(query_set, searcher, layers_combs):
	'''
	Shows retrieval performance for each given layer combination.

	query_set    : name of the query set to load (limited to 40 queries).
	searcher     : searcher object supporting set_params(**kwargs).
	layers_combs : iterable of strings, each a combination of the letter
	               codes below (e.g. "PAK"); a letter's presence enables
	               the corresponding layer with weight 1.0.
	'''

	queries = load_query_set(query_set, 40)

	# Maps one-letter layer codes to the searcher parameter they control.
	# NOTE(review): 'T' (topics) is used by some combos elsewhere in this
	# file but has no entry here -- confirm whether it should be mapped.
	legend = {'P': 'papers_relev',
						'A': 'authors_relev',
						'K': 'words_relev',
						'V': 'venues_relev'}

	# Attenuator parameters are zeroed so only layer weights matter here.
	params = {'age_relev':   0.0,
						'query_relev': 0.0,
						'ctx_relev':   0.0}

	for layers in layers_combs :

		# Python 2 trailing comma: keep metrics on the same output line.
		print "%s\t" % layers,

		# Enable (1.0) exactly the layers named in this combination string,
		# disable (0.0) all others.
		for layer in legend:
			params[legend[layer]] = 1.0 if (layer in layers) else 0.0

		searcher.set_params(**params)
		# force=True recomputes results; they are cached per-combination
		# under the layers_effect results folder.
		get_search_metrics(queries, searcher, show=True, force=True,
											 results_file=("%s/results/layers_effect/%s.p" % (config.DATA, layers)))

	print
Ejemplo n.º 2
0
def save_scholar_results(query_set, n):
    '''
    For each query in the set, search Google Scholar and save at least n
    result titles (one per line, UTF-8) to a per-query text file under
    DATA/scholar/DATASET/. Queries whose output file already exists are
    skipped, so interrupted runs can be resumed.

    Raises Exception when the very first page of a query returns zero
    articles, which is taken to mean the request was blocked (captcha).
    '''
    from scholar_api import ScholarQuerier, SearchScholarQuery

    queries = load_query_set(query_set)

    querier = ScholarQuerier()
    scholar_query = SearchScholarQuery()

    # Folder to store saved results
    # from_folder = config.QUERY_SETS_PATH + "manual"
    save_folder = "%s/scholar/%s" % (config.DATA, config.DATASET)

    for query, query_id, _, _, _, _ in queries:

        # Only searches if file doesn't already exists
        file_path = "%s/%s.txt" % (save_folder, query_id)
        if os.path.exists(file_path):
            continue

        # Throttle requests to avoid being rate-limited by Scholar.
        time.sleep(1)

        print "\nProcessing '%s'" % query,
        scholar_query.set_words(query)

        # We stop requesting new pages once we found at least n
        start = 0
        titles = []
        while (len(titles) < n):

            scholar_query.set_start(start)
            querier.send_query(scholar_query)

            # Check if we got a captcha as response
            if (len(querier.articles) == 0):
                if (start == 0):
                    raise Exception(
                        "Request probably got blocked due to overload.")
                else:
                    # If we got 0 article after some requests it may be that
                    # all articles were fetched. Skip to next query.
                    break

            # Get only titles and try to find entries in our dataset for them
            # (strip trailing periods/spaces Scholar appends to titles).
            for article in querier.articles:
                titles.append(article['title'].strip('. '))

            # Set correct pagination
            # NOTE(review): assumes Scholar returns 20 results per page --
            # confirm against scholar_api's page size.
            start += 20

        # Write to file
        # NOTE(review): append mode ('a') relies on the existence check
        # above to avoid duplicating content; 'file' shadows the builtin.
        with open(file_path, 'a') as file:
            print >> file, query
            for title in titles:
                print >> file, "%s" % (title.encode("UTF-8"))
Ejemplo n.º 3
0
def save_layers_results_query_set(query_set):
	'''
	Dumps per-layer search results for every query in the given set
	(limited to 10 queries), one output folder per layer.
	'''

	# Keep only the query text; the remaining tuple fields are unused here.
	query_texts = []
	for entry in load_query_set(query_set, 10):
		query_texts.append(entry[0])

	searcher = Searcher(**PARAMS)

	for layer_name in ('paper', 'author', 'venue', 'ngram'):
		out_folder = "%s/results/layers/%s/%s/" % (config.DATA, config.DATASET, layer_name)
		get_layer_results(query_texts, searcher, out_folder, layer_name)
Ejemplo n.º 4
0
def save_aminer_results(query_set):

    queries = load_query_set(query_set, limit=100)
    for query, pub_id, _, _, _, _ in queries:

        #		folder = config.DATA + "aminer/" + query_set

        folder = "%s/aminer/%s" % (config.DATA, config.DATASET)
        if (not os.path.exists(folder)):
            os.makedirs(folder)

        # Only searches if file doesn't already exists
        file_path = "%s/%s.txt" % (folder, pub_id)
        if os.path.exists(file_path):
            continue

        titles = search_aminer(query, 20)

        print "%d\t%s" % (len(titles), query)
        with open(file_path, 'w') as file:
            print >> file, query
            for title in titles:
                print >> file, "%s" % (title.encode("UTF-8"))
Ejemplo n.º 5
0
def main() :

#	for qs in ["manual", "surveys", "testing"] :
#		print "\n%s" % qs
#		time_diversity(["MultiLayered", "TopCited(G)"], qs)

#	sys.exit()

#	config.IN_MODELS_FOLDER = config.DATA + "topic_models/%s_%d_%d"
##	check_topics_effect(Searcher(**PARAMS), "manual")
#	show_layers_effect("manual", Searcher(**PARAMS), ["PAK", "PAT", "PAKT"][:1])

#	save_layers_results_queries(["citation recommendation",
#															 "author recommendation",
#															 "link prediction"],
#															 folder="/var/tmp/results")
#	save_layer_results("manual")

#	query_set = 'manual'
#	query_set = 'surveys'
#	query_set = 'tuning'
#	query_set = 'testing'
#	queries = load_query_set(query_set, 200)

#	vary_parameters(Searcher(**PARAMS), queries, 'age_relev', [0.0, 0.001, 0.01, 0.1, 0.25, 0.5, 1.0])
#	vary_parameters(Searcher(**PARAMS), queries, 'ctx_relev', [0.0, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0])
#	vary_parameters(Searcher(**PARAMS), queries, 'query_relev', [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.8])

#	layers = ["P","PA","PV","PK","PAV","PAK","PKV","PAKV"]
#	show_layers_effect(queries, Searcher(**BEST_PARAMS), layers)

#	show_attenuators_effect(queries, Searcher(**PARAMS))

#	vary_rho_values(Searcher(**PARAMS), queries, 'papers_relev',  [0.05, 0.1, 0.25, 0.5, 0.75, 0.85, 0.9, 1.0])
#	vary_rho_values(Searcher(**PARAMS), queries, 'authors_relev', [0.0, 0.1, 0.25, 0.5, 0.75, 0.9])
#	vary_rho_values(Searcher(**PARAMS), queries, 'topics_relev',  [0.0, 0.1, 0.25, 0.5, 0.75, 0.9])
#	vary_rho_values(Searcher(**PARAMS), queries, 'words_relev',   [0.0, 0.1, 0.25, 0.5, 0.75, 0.9])
#	vary_rho_values(Searcher(**BEST_PARAMS), queries, 'venues_relev',  [0.0, 0.05, 0.1, 0.25, 0.5])

#	get_other_layers(load_query_set('manual', 10), Searcher(**BEST_PARAMS), "ngram")


	query_sets = [
							# 'manual',
							# 'surveys',
							# 'tuning',
							'testing'
							]

	searchers = [
						Searcher(**PARAMS),
						# Searcher(**config.PARAMS),
						# PageRankSubgraphSearcher(**PARAMS),
						# TopCitedSubgraphSearcher(**PARAMS),
						# TopCitedGlobalSearcher(),
						# TFIDFSearcher(),
						# BM25Searcher(),
						# CiteRankSearcher(tau=2.6),
						# PageRankFilterBeforeSearcher(),
						# PageRankFilterAfterSearcher(),
#						GoogleScholarSearcher(),
#						ArnetMinerSearcher(),
						#MengSearcher(),
#						CiteseerSearcher("eval/citeseer"),
						# WeightedTopCitedSubgraphSearcher(**PARAMS)
					]

	for query_set in query_sets :

		log.info("Running '%s' query set.\n" % query_set)

		queries = load_query_set(query_set, 200)
		for s in searchers :
			print "%s\t" % s.name(),
#			print "\nRunning %s with %d queries from %s set..." % \
#																	(s.name(), len(queries), query_set)
			if s.name() == "MultiLayered":
				s.set_params(**{
							  'K': 20,
						      'H': 1,
							  'papers_relev': 0.25,
							  'authors_relev': 0.25,
						   	  'words_relev': 0.25,
							  'topics_relev' : 0.0,
							  'venues_relev': 0.25,
							  'alpha': 0.3,
							  'query_relev': 0.3,
							  'age_relev': 0.01,
   							  'ctx_relev': 0.5})

			if s.name() == "TopCited(G)": # TopCitedSubgraphSearcher
				s.set_params(**{
							  'K': 20,
						      'H': 1,
							  'papers_relev': 0.25,
							  'authors_relev': 0.25,
						   	  'words_relev': 0.25,
							  'topics_relev' : 0.0,
							  'venues_relev': 0.25,
							  'alpha': 0.3,
							  'query_relev': 0.3,
							  'age_relev': 0.01,
   							  'ctx_relev': 0.5})


			if s.name() == "WeightedTopCited(G)":
				s.set_params(**{
							  'K': 20,
						      'H': 1,
							  'query_relev': 0.15,  # 0.15
						      'age_relev': 0.01, # 0.01
							  'ctx_relev': 0.8, # 0.6 (manual), 0.8
							  'beta': 0.1}) # 0.1
			rfile = get_results_file(query_set, s.name())
			get_search_metrics(queries, s, force=True, results_file=rfile)
			del s
Ejemplo n.º 6
0
def check_topics_effect(searcher, query_set):
	'''
	Computes search metrics for the given searcher on a small sample
	(10 queries) of the named query set, forcing recomputation.
	'''
	sample = load_query_set(query_set, 10)
	get_search_metrics(sample, searcher, force=True)
Ejemplo n.º 7
0
        maps = []
        for query, pub_id, _year, actual_docs, _rels, _titles in queries:

            top = self.searcher.search(query,
                                       limit=20,
                                       exclude=set([pub_id]),
                                       force=False)
            maps.append(apk(actual_docs, top, k=20))

        return np.mean(maps)


if __name__ == '__main__':

    query_set = 'manual'
    queries = load_query_set(query_set, 30)

    se = SearchEvaluator(Searcher(**config.PARAMS))
    bocv = BayesianOptCV(
        estimator=se,
        param_bounds={
            'K': (5, 50),
            'papers_relev': (0.001, 1.0),
            'authors_relev': (0.001, 1.0),
            'venues_relev': (0.001, 1.0),
            'words_relev': (0.001, 1.0),
            'alpha': (0.1, 0.9),
            'age_relev': (0.001, 1.0),
            'query_relev': (0.001, 1.0),
            'ctx_relev': (0.01, 10.0)
        },