def main(): country_geojson = {'type': 'FeatureCollection', 'features': []} place_geojson = {'type': 'FeatureCollection', 'features': []} country_features = {} place_features = {} for a in articles: locs = a['nytd_geo_facet'] for loc in locs: if loc in country_freq: if loc not in country_features: country_features[loc] = get_feature(loc) feat = country_features[loc] # add article and increment article count feat['properties']['articles'].append(a) feat['properties']['article_count'] += 1 elif loc in place_freq: if loc not in place_features: place_features[loc] = get_feature(loc) feat = place_features[loc] # add article and increment article count feat['properties']['articles'].append(a) feat['properties']['article_count'] += 1 else: print "could not place " + loc print type(country_features) country_geojson['features'] = [country_features[key] for key in country_features] place_geojson['features'] = [place_features[key] for key in place_features] jsonfiles.write('../json/output/countries_2012_v3.json', country_geojson) jsonfiles.write('../json/output/places_2012_v3.json', place_geojson)
def main(year_str): fields = jsonfiles.read("../json/fields.json") year = int(year_str) print 'Getting articles for the year ' + str(year) results = [] # make request once to get the number of pages in the response querystring = get_querystring(year, fields, 0) r = requests.get(querystring).json() results = results + r["results"] pages = r["total"] / 10 # make request for remaining pages # write out every 1000 requests i = 1 last_start = 1 for p in xrange(last_start, pages + 1): # try until request succeeds while True: try: querystring = get_querystring(year, fields, p) r = requests.get(querystring).json() results = results + r["results"] print ("request #%d - first title: %s") % (p, r["results"][0]["title"]) except ValueError as e: print e print "retrying ..." time.sleep(1.0) continue except KeyError as e: print e print 'Skipping ', str(p) break if p % 1000 == 0: print "set of 1000 - first title is:\n%s" % results[0]["title"] filename = "".join([ "../json/output/nyt_articles_", str(year), "_", str(i), ".json"]) jsonfiles.write(filename, results) i += 1 # reset results results = [] # sleep so we're not locked out of the API time.sleep(0.08) # write remaining results filename = "".join([ "../json/output/nyt_articles_", str(year), "_", str(i), ".json"]) jsonfiles.write(filename, results)
def main(year):
    """Reduce the geocoded-locations file for *year* to a simple
    {location: {'lat', 'lon', 'name'}} mapping and write it out."""
    infile = '../json/output/geocoded_locs_%s.json' % year
    locs_and_articles = jsonfiles.read(infile)

    coords = {}
    for name in locs_and_articles:
        entry = locs_and_articles[name]
        coords[name] = {
            'lat': entry['lat'],
            'lon': entry['lon'],
            'name': name,
        }

    outfile = '../json/output/place_to_coord_mappings_%s.json' % year
    jsonfiles.write(outfile, coords)
def write_output(filtered_articles, countries_geojson, places_geojson, year): # write to file try: # write out countries and the articles that correspond to them filename = "../json/output/countries_%s_v2.json" % year jsonfiles.write(filename, countries_geojson) # write out places and the articles that correspond to them filename = "../json/output/places_%s_v2.json" % year jsonfiles.write(filename, places_geojson) except IOError as e: print e
def main(argv): if len(argv) != 2 or int(argv[0]) < 1980: print "Invalid args: " + str(argv) return year = int(argv[0]) segments = int(argv[1]) print "Joining %d files for %d" % (segments, year) output = [] for i in xrange(1, segments + 1): filename = ("../json/output/nyt_articles_%s_%d.json" % (str(year), i)) r = jsonfiles.read(filename) print type(r) output = output + r print len(output) outfile_name = "../json/output/nyt_articles_%d_all.json" % year jsonfiles.write(outfile_name, output)
import csv

import jsonfiles

# Build {country_name: population} from the 2010 "High variant" rows of the
# population CSV and write it out as JSON.
# NOTE(review): the value in column 3 has its '.' characters stripped before
# int() — presumably '.' is a thousands/decimal separator in this dataset, so
# stripping yields the full head count; confirm against the source CSV.
# (Removed an unused `lines = []` local from the original.)
with open('../json/population_2010.csv', 'rb') as f:
    countries = {}
    for row in csv.reader(f):
        if '2010' in row and 'High variant' in row:
            population = int(row[3].replace('.', ''))
            countries[row[0]] = population

jsonfiles.write('../json/output/country_populations_2010.json', countries)
import jsonfiles

# Group 2012 article URLs by the countries named in their geo facets,
# keeping a running count per country, then write the index to JSON.
articles = jsonfiles.read('../json/output/nyt_articles_2012_filtered.json')
country_list = jsonfiles.read('../json/output/countries.json')

countries = {}
for article in articles:
    for facet in article['nytd_geo_facet']:
        # facets that are not countries are skipped
        if facet not in country_list:
            continue
        entry = countries.setdefault(
            facet, {'articles': [], 'article_count': 0})
        entry['articles'].append(article['url'])
        entry['article_count'] += 1

jsonfiles.write('../json/output/articles_by_country.json', countries)
def main(year): if not USE_CACHED: print "Beginning ..." # load list of all articles filename = "../json/output/nyt_articles_%s_all.json" % str(year) all_articles = jsonfiles.read(filename) # filter out articles with no nytd_geo_facet property filtered_articles = get_geotagged(all_articles) # write out all articles with a geo_facet filename = "../json/output/nyt_articles_" + str(year) + "_filtered.json" jsonfiles.write(filename, filtered_articles) # get categorized dict of articles # { # "China": [{..}, {..} ... {..}], ... # ... # } locations = categorize(filtered_articles) # locations = categorize_from_local(filtered_articles) # write out articles that have been geocoded jsonfiles.write("../json/output/geocoded_locs_" + str(year) + ".json", locations) else: filtered_articles = jsonfiles.read("../json/output/nyt_articles_" + str(year) + "_filtered.json") locations = jsonfiles.read("../json/output/geocoded_locs_" + str(year) + ".json") # list of the countries in the world countries_dict = jsonfiles.read("../json/output/countries.json") # get the number of articles in each country in descending order counts = [(loc, len(locations[loc]["articles"])) for loc in locations if loc in countries_dict] descending = sorted(counts, key=lambda x: x[1]) descending.reverse() freq = {d[0]: d[1] for d in descending} jsonfiles.write("../json/output/article_freq_by_country.json", freq) # do the same for places counts = [(loc, len(locations[loc]["articles"])) for loc in locations if loc not in countries_dict] descending = sorted(counts, key=lambda x: x[1]) descending.reverse() freq = {d[0]: d[1] for d in descending} jsonfiles.write("../json/output/article_freq_by_place.json", freq) countries_geojson = {"type": "FeatureCollection", "features": []} places_geojson = {"type": "FeatureCollection", "features": []} g = geocoders.GoogleV3() for loc in locations: for article in locations[loc]["articles"]: feature = get_feature(article, locations[loc]) if not feature: continue if loc in 
countries_dict: countries_geojson["features"].append(feature) else: places_geojson["features"].append(feature) print ( "%d article matches for countries and %d matches for places" % (len(countries_geojson["features"]), len(places_geojson["features"])) )