Example #1
import sys

import general_functions
from general_functions import load_pickle, save_pickle
# get_html, wait_for_random_time and given_article_id_calculate_top_related
# are assumed to come from the project's other helper modules
def extract_HTML_from_all_vocab_pages():
	"""
	Loops through all vocab pages, extracts the HTML from each, and saves to a pickle after every page.
	The created list looks like this:
	[
		{'url':'http://www.manythings.org/vocabulary/lists/e/words.php?f=baseball_positions',
		'html':...
		},
	]
	"""
	vocab_pages_HTML = load_pickle('working/vocab_pages_HTML.p')
	if vocab_pages_HTML is None:  # first run: no pickle exists yet
		vocab_pages_HTML = []
	vocab_page_urls = load_pickle('working/vocab_page_urls.p')
	already_pulled = set(i['url'] for i in vocab_pages_HTML)
	for url in vocab_page_urls:
		if url not in already_pulled:
			print "We have not pulled this url before"
			html = get_html(url)
			vocab_pages_HTML.append(
				{
					'url': url,
					'html': html
				}
			)
			already_pulled.add(url)
			save_pickle(vocab_pages_HTML, 'working/vocab_pages_HTML.p')
			wait_for_random_time(wait_base=4, wait_rand_ceil=10)
		else:
			print "We have already pulled this url (%s)" % url
def play_with_related_articles(a=None, m=None):
	"""
	Explore related articles in the terminal...
	NB: messy code!
	"""
	if a is None and m is None:
		a = general_functions.load_pickle('data/articles.p')
		m = general_functions.load_pickle('data/articles_cosine_similarities.p')
	id_ = raw_input('Enter initial article ID. Leave blank to start with early NSA. >> ')
	if id_ == '':
		id_ = 'world/2013/jun/09/nsa-prism-uk-government'
	future_or_past = raw_input('Look into the future (f) or past (p)? >> ')
	future_or_past = future_or_past.strip().lower()[0]  # strip before indexing, so " F" works
	t = {'f': 'future_articles', 'p': 'past_articles'}[future_or_past]
	while True:
		id_out = given_article_id_calculate_top_related(id_, t, m, a)
		print chr(27) + "[2J"
		if not id_out:
			print "NO MORE ARTICLES!"
			sys.exit()		
		print "==============\nCURRENT ARTICLE:"
		print a[id_]['headline']
		print a[id_]['standfirst']
		print a[id_]['date']
		print id_
		print 
		print "==============\nRELATED ARTICLES:"
		counter = 0
		for x in id_out:
			counter += 1	
			print 				
			print counter
			print a[x]['headline']
			print a[x]['standfirst']
			earlier_later = 'later' if future_or_past == 'f' else 'earlier'
			print str((a[x]['date'] - a[id_]['date']).days) + ' days ' + earlier_later
			print x
			print "\n-----------------"
			
		number = raw_input('Which article (1, 2, 3, etc...) shall we follow next...? >> ')
		number = int(number)-1
		id_ = id_out[number]
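
# Typical invocation (loading the pickles once up front makes repeated
# exploration faster; with no arguments the function loads them itself):
# >>> a = general_functions.load_pickle('data/articles.p')
# >>> m = general_functions.load_pickle('data/articles_cosine_similarities.p')
# >>> play_with_related_articles(a, m)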
Example #4
import datetime
import pprint
import sys
import time

import general_functions
from butterfly_main import *  # assumed to provide options, get_cosine, compare_dates, article_type_is_bad, create_vector_from_article, etc.

# Module-level verbosity flag ('debug' is referenced by the functions below;
# assumed to default to False)
debug = False

# For testing purposes
if False:
	a=general_functions.load_pickle('data/articles_uk.p')
	m=general_functions.load_pickle('data/articles_uk_cosine_similarities.p')
	ids = [  (u'world/2014/jan/27/nsa-gchq-smartphone-app-angry-birds-personal-data', 82),
			 (u'world/2014/feb/02/david-miranda-detention-chilling-attack-journalism', 87),
			 (u'world/2014/feb/14/court-challenge-mass-surveillance', 99),
			 (u'world/2014/feb/18/merkel-phone-tapping-law-mi6-nigel-inkster', 103),
			 (u'world/2014/feb/19/david-miranda-detention-lawful-court-glenn-greenwald',
			  104),
			 (u'world/2014/feb/27/gchq-interception-storage-webcam-images-condemned', 112),
			 (u'world/2014/feb/27/gchq-insists-optic-nerve-program-legal-legislation-2000',
			  112),
			 (u'world/2014/feb/28/nsa-gchq-webcam-spy-program-senate-investigation', 113),
			 (u'world/2014/feb/27/gchq-nsa-webcam-images-internet-yahoo', 113),
			 (u'world/2014/mar/04/nsa-chief-keith-alexander-david-miranda', 117),
			 (u'world/2014/apr/11/journalists-nsa-guardian-polk-award-snowden', 155),
			 (u'world/2014/may/11/lack-oversight-nsa-menwith-hill', 185),
			 (u'world/2014/may/15/david-miranda-appeal-high-court-ruling-detention-heathrow',
			  189),
			 (u'world/2014/may/23/surveillance-claims-boston-college-tapes', 197),
			 (u'world/2014/jun/07/stephen-fry-denounces-uk-government-edward-snowden-nsa-revelations',
			  212),
			 (u'world/2014/jun/11/government-public-case-surveillance-state-theresa-may',
			  216),
			 (u'world/2014/jun/17/mass-surveillance-social-media-permitted-uk-law-charles-farr',
			  222),
			 (u'world/2014/jun/18/labour-merkel-nsa-phone-tapping-raf-croughton', 223),
	]
def create_frozen_kmeans_lookup(articles,
		cosine_similarity_matrix,
		future_or_past,
		days_ago_start=90,
		days_ago_end=0,
		incremental_add=True):
	"""
	If we don't want to calculate kmeans clusters at run-time, we can calculate in advance
	This function carries this out
	For each article in cosine_similarity_matrix we calculate the top 2 or 3 articles to show via k-means
	We save everything to a pickle (which can later be 'butterzipped')
	Format is like this, i.e. each article in cosine_sim_matrix is a key with related articles as list:
	{
		'world/2014/nov...': ['world/2015/jan/...', 'world/2014/dec/...'],
		'world/2014/oct...': ['world/2015/feb/...', 'world/2014/dec/...'],
	}
	We can select the number of days to go back. We should always go back 90 (atleast 90!) so that older articles can 'find' any newer articles we have added in the 90 days after it. If incremental_add = False, number of days is set at 9999.
	"""
	# Ensure argument is correct
	assert future_or_past in ('future_articles', 'past_articles'), \
		'future_or_past must be "future_articles" or "past_articles"'
	# Path at which we will save/load pickle:
	if future_or_past == 'future_articles':
		save_path = options.current_articles_path_frozen_kmeans_future
	elif future_or_past == 'past_articles':
		save_path = options.current_articles_path_frozen_kmeans_past
	
	# We will fill out a dictionary
	# If we aren't adding incrementally we start from afresh
	if not incremental_add:
		frozen_kmeans = {}
		days_ago_start = 9999
		days_ago_end = 0
		print "Incremental add is set to False. Creating a new pickle."
	# If we want to add incrementally, we load the existing pickle (if exists)
	else:
		frozen_kmeans = general_functions.load_pickle(save_path)
		if frozen_kmeans is None:
			print "WARNING: Creating new pickle - no file found at %s" % save_path
			frozen_kmeans = {}

	# Counters, ids_to_calculate (only articles newer than a certain date)
	counter = 0
	ids_to_calculate = [id_ for id_ in cosine_similarity_matrix
		if (datetime.datetime.today()-articles[id_]['date']).days <= days_ago_start
		if (datetime.datetime.today()-articles[id_]['date']).days >= days_ago_end ]	
	total = len(ids_to_calculate)
	print "We will calculate top K-means results for %s articles (i.e. those between %s and %s days ago)" %(total, days_ago_start, days_ago_end)


	# Loop through all articles in cosine_similarity_matrix
	for article_id in ids_to_calculate:
		counter += 1
		# Find top ID for each cluster via K-means functionality
		ids = given_article_id_calculate_top_related(
			article_id=article_id,
			future_or_past=future_or_past,
			cosine_similarity_matrix=cosine_similarity_matrix,
			articles=articles)
		# Save these to our data structure
		frozen_kmeans[article_id] = ids
		print "Saved %s/%s [%s]" %(counter, total, article_id)

		if counter%100==0:
			general_functions.save_pickle(data = frozen_kmeans,
				filename = save_path)

	general_functions.save_pickle(data = frozen_kmeans,
				filename = save_path)
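
# A minimal sketch of the clustering helper assumed throughout (hypothetical --
# the real given_article_id_calculate_top_related lives elsewhere in the
# project). Per the docstring above, the idea is to cluster the related
# articles' similarity scores with a tiny 1-D k-means and surface the
# top-scoring article from each cluster, so suggestions span strongly- and
# loosely-related bands rather than all coming from one band. Assumes k >= 2.
def given_article_id_calculate_top_related_sketch(
		article_id, future_or_past, cosine_similarity_matrix, articles, k=3):
	related = cosine_similarity_matrix[article_id][future_or_past]
	if len(related) <= k:
		return sorted(related, key=related.get, reverse=True)
	scores = sorted(set(related.values()))
	# Spread k initial centroids across the observed score range
	centroids = [scores[i * (len(scores) - 1) // (k - 1)] for i in range(k)]
	for _ in range(10):  # a few Lloyd iterations are plenty in one dimension
		clusters = [[] for _ in range(k)]
		for id_, score in related.items():
			nearest = min(range(k), key=lambda c: abs(score - centroids[c]))
			clusters[nearest].append(id_)
		centroids = [
			float(sum(related[i] for i in c)) / len(c) if c else centroids[n]
			for n, c in enumerate(clusters)]
	# One representative per non-empty cluster: its highest-scoring article
	return [max(c, key=related.get) for c in clusters if c]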
def create_butterzip_files(use_scikit_learn=False):
	""""
	1) Creates articles_butterzip.p and articles_butterzip.py:
		... these only contain articles in the cosine_similarity_matrix
		... these only contain fields used in the butterfly viz (id, headline, strapline, image, date) 
		... these also contain frozen k-means results so we don't have to calculate live (TODO: only future articles included for now to save on memory)

	2) Creates cosine_similarity_matrix_butterzip.py, which is a copy of cosine_similarity_matrix but uses simpler IDs (from lookup). Also need to create the lookup both ways for this.


	"""

	print "Creating butterzip files..."

	# Load main articles and cosine similarites pickles created previously
	articles = general_functions.load_pickle(filename = options.current_articles_path)
	cosine_similarity_matrix = general_functions.load_pickle(filename = options.current_articles_path_cosine_similarites)
	frozen_kmeans_future = general_functions.load_pickle(filename = options.current_articles_path_frozen_kmeans_future)
	frozen_kmeans_past = general_functions.load_pickle(filename = options.current_articles_path_frozen_kmeans_past)

	# 1) Create new articles (articles_butterzip)
	articles_butterzip = {}
	for id_ in cosine_similarity_matrix:

		if id_ in frozen_kmeans_future:
			if use_scikit_learn:
				article_barebones = {
					'headline':articles[id_]['headline'],
					'standfirst':articles[id_]['standfirst'],
					'date':articles[id_]['date'],
					'thumbnail':articles[id_]['thumbnail'],
					'tags':articles[id_]['tags'],
				}
			else: 
				article_barebones = {
					'headline':articles[id_]['headline'],
					'standfirst':articles[id_]['standfirst'],
					'date':articles[id_]['date'],
					'thumbnail':articles[id_]['thumbnail'],
					'f':frozen_kmeans_future[id_]  # frozen kmeans - also add past articles on another line here under 'p' key (TODO)
				}
			articles_butterzip[id_] = article_barebones

		else:
			print "Can't find article %s in frozen_kmeans_future - why not?" %id_

	# Save this as a pickle and a file
	general_functions.save_pickle(
		data = articles_butterzip, 
		filename = options.current_articles_path_butterzip)
	general_functions.write_to_python_module(
		data = articles_butterzip,
		variable_name = 'articles',
		filename = '../flask/articles_butterzip.py')

	# 2) Create simpler ID version of cosine_similarity_matrix
	
	# We need to create lookups for both ways
	# i.e. guardianid_to_countid = {'world/2014/Nov...':1, 'world/2014/Dec...':2,}
	# i.e. countid_to_guardianid = {1:'world/2014/Nov...', 2:'world/2014/Dec...',}
	guardianid_to_countid = {}
	countid_to_guardianid = {}
	counter = 0
	for id_ in cosine_similarity_matrix:
		counter+=1
		guardianid_to_countid[id_]=counter
		countid_to_guardianid[counter]=id_

	general_functions.write_to_python_module(
		data = guardianid_to_countid,
		variable_name = 'guardianid_to_countid',
		filename = '../flask/guardianid_to_countid.py')	
	general_functions.write_to_python_module(
		data = countid_to_guardianid,
		variable_name = 'countid_to_guardianid',
		filename = '../flask/countid_to_guardianid.py')	

	# And now we create a cosine_similarity_matrix_butterzip (i.e. reduced cosine_similarity_matrix) using these IDs
	cosine_similarity_matrix_butterzip = {}
	for id_ in cosine_similarity_matrix:
		# Get list of future articles and past articles for this ID within c_s_m
		future_articles = cosine_similarity_matrix[id_]['future_articles']
		past_articles = cosine_similarity_matrix[id_]['past_articles']
		# Convert these guardianids to countids within the lists
		future_articles_butterzip = {guardianid_to_countid[key]: future_articles[key]
			for key in future_articles if key in guardianid_to_countid}
		past_articles_butterzip = {guardianid_to_countid[key]: past_articles[key]
			for key in past_articles if key in guardianid_to_countid}
		# Save entries to butterzipped new cosine_similarity_matrix dict
		# Use 'f' instead of 'future_articles'
		# Use 'p' instead of 'past_articles'
		cosine_similarity_matrix_butterzip[id_] = {
			'f':future_articles_butterzip,
			'p':past_articles_butterzip
		}

	general_functions.write_to_python_module(
		data = cosine_similarity_matrix_butterzip,
		variable_name = 'cosine_similarity_matrix',
		filename = '../flask/cosine_similarity_matrix_butterzip.py')
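
# A minimal sketch of the assumed serialisation helper (hypothetical -- the real
# general_functions.write_to_python_module is defined elsewhere). The idea is to
# dump a data structure into an importable .py file, so the Flask app on Heroku
# can simply `import` the data instead of unpickling it:
def write_to_python_module_sketch(data, variable_name, filename):
	with open(filename, 'w') as f:
		f.write('import datetime\n\n')  # repr() of any datetime values needs this
		f.write('%s = %s\n' % (variable_name, pprint.pformat(data)))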
def create_cosine_similarity_pickle_all_articles(threshold=0.3, incremental_add=True, fb_share_minimum=10):
	"""
	Create cosine similarity matrix for all articles currently existing
	Should only need to be run once (we can then add to it incrementally)
	Pickle will be saved to disk.
	Form of matrix is:
	{'article1': 
		{'past_articles':
			{'article5':0.71,
			 'article14': 0.88,
			 ...
			 }
		},
		{'future_articles':
			{'article9':0.99,
			 'article29': 0.73,
			 ...
			 }
		},
	}
	"""
	if not incremental_add:
		# Everything will be recalculated afresh
		# Check we want to overwrite the main pickle (will take a long time...)
		print "WARNING: Are you sure you wish to recalculate all cosine similarity values?"
		print "This will overwrite all previous values and may take a long time."
		print "We will use *%s* as the article set." % options.current_articles_path
		print "If not, set incremental_add to True: this will only calculate new cosine pairs."
		print "Press 'y' to continue..."
		# Confirmation is currently bypassed for unattended runs; restore the
		# raw_input line below to require manual confirmation
		# u = raw_input('>> ')
		u = 'y'
		if u != 'y':
			print "** CANCELLED **"
			return None
		print "Creating cosine similarity matrix using %s" % options.current_articles_path

		# Empty variable to fill
		cosine_similarity_matrix = {}  # To fill with whole matrix

	else:
		# We will only add the new articles and calculate pairs involving these
		cosine_similarity_matrix = general_functions.load_pickle(
			filename=options.current_articles_path_cosine_similarites)  # To add incremental articles to
		if cosine_similarity_matrix is None:
			print "-- WARNING: could not load similarity matrix. Does it exist already or do we need to switch off incremental mode (i.e. create new matrix)?\n"
			sys.exit()

	# Start timer
	t1 = time.time()

	# Load articles and start counter	
	articles = general_functions.load_pickle(options.current_articles_path)
	total_number = len(articles)
	counter = 0
	print "We have %s articles to loop through." %total_number

	# Loop through all articles in main collection	
	for id1 in articles:
		counter+=1
		print "Article %s/%s" %(counter, total_number)

		# Skip if article already in the cosine_similarity_matrix
		if id1 in cosine_similarity_matrix:
			print "Skipping article, already analysed"
			continue

		# Skip if Facebook share rate too low
		fb_shares = articles[id1]['facebook']['snapshot_of_total']
		try:
			if int(fb_shares) < fb_share_minimum:
				print "Skipping article, FB shares too low"
				continue
		except (TypeError, ValueError):
			print "Skipping article, FB shares not crawled yet"
			continue

		# Skip if article is not of type we like
		if article_type_is_bad(id1):
			print "Skipping article, bad type"
			continue

		# Load article
		article1 = articles[id1]
		article1_cosine_similarities = {'future_articles':{}, 'past_articles':{}}  # To fill with 1 row

		# Create vector (dict of counts) for the article
		article1_vector = create_vector_from_article(article1)
		if debug:			
			print "================================"
			print "article1:"
			print "Headline:", article1['headline']
			print "Tags:", article1['tags']		
			print "Vector created:", article1_vector

		# Loop through all second articles
		for id2 in articles:
			# Skip if same article
			if id1==id2:
				continue
			# Skip if low FB shares
			fb_shares = articles[id2]['facebook']['snapshot_of_total']
			try:
				if int(fb_shares) < fb_share_minimum:
					continue
			except (TypeError, ValueError):
				continue
			# Skip if bad article type
			if article_type_is_bad(id2):
				continue

			# Load article
			article2 = articles[id2]

			# Create vector (dict of counts) for the second article
			article2_vector = create_vector_from_article(article2)			
			if debug:
				print ""
				print "-- article2:"
				print "-- Headline:", article2['headline']
				print "-- Tags:", article2['tags']		
				print "-- Vector created:", article2_vector

			# Calculate crossover
			cosine = get_cosine(article1_vector, article2_vector)
			if debug:
				print "---- Cosine = %s" %cosine
			
			# Ignore second article if too low crossover
			if cosine<threshold:				
				if debug:
					print "---- Not storing, cosine too low"
				continue

			# Store in collection if crossover is over the threshold
			# Stored as an integer (cosine * 100), which saves ~25% space
			date1, date2 = article1['date'], article2['date']
			future_or_past = compare_dates(date1, date2)
			cosine = int(cosine*100)

			if future_or_past=='future':
				article1_cosine_similarities['future_articles'][id2] = cosine
			elif future_or_past=='past':
				article1_cosine_similarities['past_articles'][id2] = cosine

			# Also add to the corresponding 'other' article if it already exists in the matrix
			# Note that past/future are swapped: from id2's perspective the date comparison is reversed
			if id2 in cosine_similarity_matrix:
				if future_or_past=='future':
					cosine_similarity_matrix[id2]['past_articles'][id1] = cosine
				elif future_or_past=='past':
					cosine_similarity_matrix[id2]['future_articles'][id1] = cosine

		# Now we have looped through all articles, we add the line for id1 to our overall matrix
		cosine_similarity_matrix[id1] = article1_cosine_similarities
		if debug:
			print "======\n====="
			print "THIS IS THE SIMILARITY MATRIX LINE"
			pprint.pprint(cosine_similarity_matrix)

		# Checkpoint the pickle every 30 articles
		if counter % 30 == 0:
			general_functions.save_pickle(data=cosine_similarity_matrix, filename=options.current_articles_path_cosine_similarites)

	# End for statement looping over id1. Save pickle.
	general_functions.save_pickle(data = cosine_similarity_matrix, filename = options.current_articles_path_cosine_similarites)
	
	# End timer
	t2 = time.time()
	print "TOTAL TIME:", t2 - t1
if __name__ == '__main__':
	if sys.argv[1] == 'fresh':
		incremental_add = False
	else:
		incremental_add = True

	# Depending on status of incremental_add:
	# If False: Create a NEW cosine similarity dictionary and save as pickle for ALL articles
	# If True: Update the cosine similarity dictionary and save as pickle for incremental articles
	# Filter out articles with less than X facebook shares
	# Only save cosine similarities above Y threshold value
	if True:
		print "COSINE SIMILARITY CALCULATIONS:"
		create_cosine_similarity_pickle_all_articles(threshold=0.4, incremental_add=incremental_add, fb_share_minimum=5)

	# If we don't want to calculate the top K-means clusters at run-time, we can save the IDs
	# beforehand by running this. The number of days to re-calculate should be 90 or above when
	# calculating for 'future' articles, as we need to 'catch' new articles coming in within the
	# historical articles' related lists.
	# If incremental_add is False, we recreate the whole lookup (with the window set to 9999 days)
	if True:
		print "KMEANS CLUSTERING CALCULATIONS:"
		create_frozen_kmeans_lookup(
			articles=general_functions.load_pickle(options.current_articles_path),
			cosine_similarity_matrix=general_functions.load_pickle(options.current_articles_path_cosine_similarites),
			future_or_past='future_articles',
			days_ago_start=180, #should be 180 or so
			days_ago_end=0, # should be 0
			incremental_add=incremental_add)

	# Create a smaller articles pickle, and python module, based on the articles in the cosine
	# similarity dict and only including relevant fields. Also create the butterzipped cosine
	# similarity matrix and its ID lookups.
	if True:
		print "COMPRESSING FILES (BUTTERZIP) FOR USE ON HEROKU"
		create_butterzip_files(use_scikit_learn=False)
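
	# Typical command-line usage (module name hypothetical); any first argument
	# other than 'fresh' runs in incremental mode:
	#   python butterfly_pipeline.py fresh   # rebuild everything from scratch
	#   python butterfly_pipeline.py update  # incremental run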
Example #13
from general_functions import load_pickle, save_pickle
def extract_words_and_create_dicts():
	"""
	Extracts word lists and categories from all HTML pages
	Saves to appropriate formats
	"""
	choose_to_continue = raw_input("\nDo you really wish to overwrite existing lookups\n(which have been manually filtered)?\nYou should create a backup first.\nType 'done' to continue. >> ")
	if choose_to_continue != 'done':
		print "Not continuing."
		return 0
	from bs4 import BeautifulSoup
	vocab_pages_HTML = load_pickle('working/vocab_pages_HTML.p')
	gathered_data = []

	for page in vocab_pages_HTML:
		category = ""
		vocab_list = []
		
		# Create soup
		html = page['html']
		soup = BeautifulSoup(html, 'html.parser')  # explicit parser avoids bs4's warning
		
		# Get category name
		category = soup.find('h2').text.split('\n')[0]

		# Get vocab lists
		ul_list = soup.findAll('ul')
		for ul in ul_list:
			li_list = ul.findAll('li')
			for li in li_list:
				vocab_list.append(li.text)

		gathered_data.append(
			{
				'url': page['url'],
				'category': category,
				'vocab_list': vocab_list
			}
		)

	# Now we have gathered all of the data, we can create the appropriate lookup tables
	word_lookup = {}
	category_lookup = {}

	total, counter = len(gathered_data), 0
	for page in gathered_data:
		counter += 1
		print page['vocab_list']
		print page['category']
		print "Number %s out of %s..." %(counter, total)
		user_input = raw_input("""
		Type:
			0 to discard
			1 to keep as is
			words (separate by commas) to add to list (animals,pets)
			R, followed by words (separate by commas) to remove from list (R,pig,cow)
		>> """	
		)
		# User wants to quit (checked first, so 'QUIT' is never treated as words to add)
		if user_input == 'QUIT':
			break

		# User wants to skip
		if user_input == '0':
			print "Skipped..."
			continue

		# User wants to keep as is (explicit '1', or just pressing Enter)
		if user_input in ('1', ''):
			words_to_add = page['vocab_list']

		# User wants to remove words
		elif user_input[0] == 'R':
			print "Removing these words..."
			words_to_remove = [y.strip() for y in user_input.split(',')[1:]]
			words_to_add = [x for x in page['vocab_list'] if x not in words_to_remove]

		# User wants to add words
		else:
			print "Adding extra words..."
			words_to_add = page['vocab_list'] + [x.strip() for x in user_input.split(',')]

		# Now continue with the process, sorting out words
		# Create category lookup
		category_lookup[page['category']] = words_to_add
		
		# Create word lookup for each word
		# Create new entry for word, or add to entry
		for word in words_to_add:
			if word in word_lookup:
				word_lookup[word].append(page['category'])
			else:
				word_lookup[word]=[page['category']]

		save_pickle(word_lookup, 'created/find_category_given_word.p')
		save_pickle(category_lookup, 'created/find_word_given_category.p')
		print "---"
	

# Get all vocab page URLs once (10 seconds)
# get_vocab_page_urls()

# Get HTML for every vocab page (400 or so in total, takes a while)
# extract_HTML_from_all_vocab_pages()

# Create dictionaries based on crawled data (very quick, no more crawling is required for this)
# extract_words_and_create_dicts()
Example #14
from general_functions import load_pickle

general_collocations = load_pickle('created/collocations_dict.p')
stopwords = load_pickle('created/stopwords.p')
simple_noun_similarity_dict = load_pickle('created/sim_dict_trimmed_n.p')
find_category_given_word = load_pickle('created/find_category_given_word.p')
find_word_given_category = load_pickle('created/find_word_given_category.p')
common_rhyming_words = load_pickle('created/common_rhyming_words.p')
list_of_all_sounds = [
        'AA',
        'AE',
        'AH',
        'AO',
        'AW',
        'AY',
        'B',
        'CH',
        'D',
        'DH',
        'EH',
        'ER',
        'EY',
        'F',
        'G',
        'HH',
        'IH',
        'IY',
        'JH',
        'K',
        'L',
        'M',