Esempio n. 1
0
def unpackage(query):
    # get the sorted set of link tuples with tf-idf
    keywords_top = get_top(tl.get_tf_idf(query, "wikipedia"))
    #store the keyword tree as a dictionary, with key initial keyword and value sub_keywords
    keywords_dict = {}
    for keyword in keywords_top:
        all_sub = tl.get_tf_idf(keyword)
        keywords_dict[keyword] = get_top(all_sub)
    return keywords_dict
Esempio n. 2
0
def unpackage (query):
  # get the sorted set of link tuples with tf-idf
  keywords_top = get_top(tl.get_tf_idf(query, "wikipedia"))
  #store the keyword tree as a dictionary, with key initial keyword and value sub_keywords
  keywords_dict = {}
  for keyword in keywords_top:
    all_sub = tl.get_tf_idf(keyword)
    keywords_dict[keyword] = get_top(all_sub)
  return keywords_dict  
Esempio n. 3
0
#!/usr/bin/python

import tfidf
import process
import google
import pickle
import order
import operator
import sys

#defines a term to search for
term = ""

#gets the list of all words with associated tf-idf scores
words = tfidf.get_tf_idf(term, "wikipedia")

#gets the cleaned list after removing list elements with non-alphanumeric charachters
cleaned = tfidf.parse(words)
cleaned = tfidf.remove_duplicates(cleaned)

#gets the list of top words (keywords)
keywords = tfidf.get_top_words(10, cleaned)
print keywords
sys.exit(0)

#gets all combinations of the keywords; this is a list of tuples
combinations = process.get_combinations(keywords)

try:		
	urls = pickle.load(open ("dump/urls_" + term + ".p", "rb"))
except:
Esempio n. 4
0
#!/usr/bin/python

import tfidf
import process
import google
import pickle
import order
import operator

#defines a term to search for
term = "Biology"

#gets the list of all words with associated tf-idf scores
words = tfidf.get_tf_idf(term, "wikipedia")

#gets the cleaned list after removing list elements with non-alphanumeric charachters
cleaned = tfidf.parse(words)
cleaned = tfidf.remove_duplicates(cleaned)

#gets the list of top words (keywords)
keywords = tfidf.get_top_words(10, cleaned)

#gets all combinations of the keywords; this is a list of tuples
combinations = process.get_combinations(keywords)

try:
    urls = pickle.load(open("generator/dump/urls_" + term + ".p", "rb"))
except:
    #otherwise, creates and searches for the keywords
    print "Searching for each keyword set instead, and saving as pickle"
    urls = []
Esempio n. 5
0
def index(request):
    if 'query' in request.GET:
        term = request.GET['query']
        #replaces spaces with underscores
        term = term.replace(" ", "_")
        print error("****THE TERM IS " + term)
    else:
        #defines a term to search for
        term = "Manifold"

    #gets the list of all words with associated tf-idf scores
    words = tfidf.get_tf_idf(term, "wikipedia")

    #gets the cleaned list after removing list elements with non-alphanumeric charachters
    cleaned = tfidf.parse(words)
    cleaned = tfidf.remove_duplicates(cleaned)

    #gets the list of top words (keywords)
    keywords = tfidf.get_top_words(10, cleaned)
    print error(str(keywords))

    #gets all combinations of the keywords; this is a list of tuples
    combinations = process.get_combinations(keywords)

    try:
        urls = pickle.load(open("generator/dump/urls_" + term + ".p", "rb"))
    except:
        #otherwise, creates and searches for the keywords
        print "Searching for each keyword set instead, and saving as pickle"
        urls = []
        for (word1, word2) in combinations:
            print "Searching for " + word1 + ", " + word2 + "."
            try:
                results = google.search(term + " " + word1 + " " + word2,
                                        "com", "en", 1, 0, 3, 2.0)

                for i in range(3):
                    next_result = results.next()
                    if not next_result in urls:
                        urls.append(next_result)

            except:
                print "HTML request overload."
                break

        pickle.dump(urls, open("generator/dump/urls_" + term + ".p", "wb"))

    #gets the cleaned text for each url
    try:
        url_text_dictionary = pickle.load(
            open("generator/dump/url_text_dictionary_" + term + ".p", "rb"))
    except:
        url_text_dictionary = process.get_text_dictionary(urls)
        pickle.dump(
            url_text_dictionary,
            open("generator/dump/url_text_dictionary_" + term + ".p", "wb"))

    urls = url_text_dictionary.keys()
    pickle.dump(urls, open("generator/dump/urls_" + term + ".p", "wb"))

    #gets the titles for each url
    try:
        print error("Loading titles.")
        url_titles = pickle.load(
            open("generator/dump/url_titles_" + term + ".p", "rb"))
        print error("Loaded existing titles.")
    except:
        print error("Getting titles.")
        url_titles = process.get_titles(urls)
        pickle.dump(url_titles,
                    open("generator/dump/url_titles_" + term + ".p", "wb"))

    #print url_text_dictionary

    #now that we have the entire dictionary of url:text mappings, we can analyze instead
    wiki_text = process.get_text('http://en.wikipedia.org/wiki/' + term)

    #creates dictionaries for the summarability and order scores
    ordered_keywords = order.get_ordered_keywords(wiki_text, keywords)
    #print error(str(ordered_keywords))
    wiki_distribution = process.get_density(wiki_text, keywords)
    summarability = {}
    orderability = {}

    #calculates the summarability score in a dictionary
    for url, text in url_text_dictionary.iteritems():
        current_distribution = process.get_density(text, keywords)
        summarability[url] = order.get_difference(wiki_distribution,
                                                  current_distribution)
        orderability[url] = order.get_order(current_distribution,
                                            ordered_keywords)

    #print summarability
    print "\n\n"
    #print orderability

    #gets the urls sorted by summarability as a list of (url, score) tuples
    sorted_summarability = sorted(summarability.iteritems(),
                                  key=operator.itemgetter(1))
    sorted_orderability = sorted(orderability.iteritems(),
                                 key=operator.itemgetter(1))

    combined = order.combine_summarability_and_orderability(
        sorted_summarability, orderability, url_titles)

    print "Titles: " + str(url_titles)
    with_titles = []
    #adds in the title to the tuple
    for (url, score) in combined:
        with_titles.append((url, score, url_titles[url]))

    return render(request, 'generator/index.html',
                  ({
                      'summarability': json.dumps(sorted_summarability),
                      'orderability': json.dumps(sorted_orderability),
                      'combined': json.dumps(combined),
                      'with_titles': json.dumps(with_titles)
                  }))
Esempio n. 6
0
def index(request):
	if 'query' in request.GET:
		term = request.GET['query']
		#replaces spaces with underscores
		term = term.replace(" ", "_")
		print error("****THE TERM IS " + term)
	else:
		#defines a term to search for
		term = "Manifold"

	#gets the list of all words with associated tf-idf scores
	words = tfidf.get_tf_idf(term, "wikipedia")

	#gets the cleaned list after removing list elements with non-alphanumeric charachters
	cleaned = tfidf.parse(words)
	cleaned = tfidf.remove_duplicates(cleaned)

	#gets the list of top words (keywords)
	keywords = tfidf.get_top_words(10, cleaned)
	print error(str(keywords))

	#gets all combinations of the keywords; this is a list of tuples
	combinations = process.get_combinations(keywords)

	try:		
		urls = pickle.load(open ("generator/dump/urls_" + term + ".p", "rb"))
	except:
		#otherwise, creates and searches for the keywords
		print "Searching for each keyword set instead, and saving as pickle"
		urls = []
		for (word1, word2) in combinations:	
			print "Searching for " + word1 + ", " + word2 + "."
			try:
				results = google.search(term + " " + word1 + " " + word2, "com", "en", 1, 0, 3, 2.0)  			
				
				for i in range(3):
					next_result = results.next()
					if not next_result in urls:
						urls.append(next_result)

			except:
				print "HTML request overload."
				break

		pickle.dump(urls, open("generator/dump/urls_" + term + ".p", "wb"))	

	#gets the cleaned text for each url
	try:
		url_text_dictionary = pickle.load(open("generator/dump/url_text_dictionary_" + term + ".p", "rb"))
	except:	
		url_text_dictionary = process.get_text_dictionary(urls)
		pickle.dump(url_text_dictionary, open("generator/dump/url_text_dictionary_" + term + ".p", "wb"))

	urls = url_text_dictionary.keys()
	pickle.dump(urls, open("generator/dump/urls_" + term + ".p", "wb"))	

	#gets the titles for each url
	try:
		print error("Loading titles.")
		url_titles = pickle.load(open("generator/dump/url_titles_" + term + ".p", "rb"))
		print error("Loaded existing titles.")
	except:	
		print error("Getting titles.")
		url_titles = process.get_titles(urls)
		pickle.dump(url_titles, open("generator/dump/url_titles_" + term + ".p", "wb"))

	#print url_text_dictionary

	#now that we have the entire dictionary of url:text mappings, we can analyze instead
	wiki_text = process.get_text('http://en.wikipedia.org/wiki/' + term)

	#creates dictionaries for the summarability and order scores
	ordered_keywords = order.get_ordered_keywords(wiki_text, keywords)
	#print error(str(ordered_keywords))
	wiki_distribution = process.get_density(wiki_text, keywords)
	summarability = {}
	orderability = {}

	#calculates the summarability score in a dictionary
	for url, text in url_text_dictionary.iteritems():
		current_distribution = process.get_density(text,keywords)
		summarability[url] = order.get_difference(wiki_distribution, current_distribution)
		orderability[url] = order.get_order(current_distribution, ordered_keywords)

	#print summarability
	print "\n\n"
	#print orderability

	#gets the urls sorted by summarability as a list of (url, score) tuples
	sorted_summarability = sorted(summarability.iteritems(), key=operator.itemgetter(1))
	sorted_orderability = sorted(orderability.iteritems(), key=operator.itemgetter(1))
	
	combined = order.combine_summarability_and_orderability(sorted_summarability, orderability, url_titles)

	print "Titles: " + str(url_titles)
	with_titles = []
	#adds in the title to the tuple
	for (url, score) in combined:
		with_titles.append((url, score, url_titles[url]))

	return render(request, 'generator/index.html', ({'summarability': json.dumps(sorted_summarability), 'orderability': json.dumps(sorted_orderability), 'combined': json.dumps(combined), 'with_titles': json.dumps(with_titles)}))