def main():
    """Add a ``geo_reverse`` field to every tweet in the geotagged input file.

    Each input line is parsed as one JSON tweet.  Every tweet receives a
    ``geo_reverse`` dict: the result of ``geolocate_reverse((lat, lon))`` when
    the tweet has a non-empty ``geo`` field, otherwise a placeholder whose
    values are all empty strings.  Tweets are handed to
    ``process_output_file`` in batches, and whatever is still buffered once
    all input has been read is written as a final batch.

    Missing input files and lines that do not hold a JSON object are reported
    on stdout and skipped.

    Side effects: sets the module globals ``first_sleep`` and
    ``total_written`` and prints progress messages to stdout.
    NOTE(review): ``process_output_file`` is defined outside this function and
    is presumably what consumes those two globals -- confirm.
    """
    global first_sleep
    global total_written

    import json
    import datetime
    import os
    import sys
    from geolocate_reverse import geolocate_reverse

    # A batch is flushed every time this many input lines have been read.
    lines_per_flush = 500

    # Keys of the placeholder stored when a tweet has no usable geo field.
    geo_reverse_keys = ("country_code", "country", "zipcode", "city",
                        "state", "state_abbr", "areacode", "FIPS",
                        "county", "Type", "Pop_2010", "Land_Sq_Mi")

    process_start = datetime.datetime.now()
    msg = "\n============================================="  + \
          "\nupdate augmented geo data                    "  + \
          "\nprocess start: %s" % process_start.strftime("%c") + \
          "\n=============================================\n"
    print(msg)
    sys.stdout.flush()

    total_lines = 0      # how many total input lines?
    output_list = []     # tweets waiting to be sent to the output file

    first_sleep = True   # first time through, we write to a new file
    total_written = 0    # how many rows have we written to the output file

    output_json_filename = "HTA_reversegeo.json"

    # read in the files one-by-one
    # ============================
    for file_counter, input_filename in enumerate(["HTA_geotagged.json"], 1):
        print("---Processing file %d %s" % (file_counter, input_filename))
        sys.stdout.flush()

        # check that the file exists
        if not os.path.isfile(input_filename):
            print("%s does not exist " % input_filename)
            sys.stdout.flush()
            continue

        # open the file and read it line-by-line
        # ======================================
        with open(input_filename, "r") as infile:
            for file_lines, line in enumerate(infile, 1):
                total_lines += 1

                # read a line of json; json.loads signals malformed input
                # with ValueError (JSONDecodeError is a subclass of it)
                try:
                    tweet = json.loads(line)
                    if not isinstance(tweet, dict):
                        raise ValueError("line does not hold a JSON object")
                except ValueError as e:
                    print("\nat line %d of %s " % (file_lines, input_filename))
                    print(repr(e))
                    print("line will not be included in the output file\n")
                    sys.stdout.flush()
                    continue

                # A missing "geo" key is treated like an empty one instead of
                # aborting the whole run with a KeyError.
                geo = tweet.get("geo")
                if geo:
                    lat = geo["coordinates"][0]
                    lon = geo["coordinates"][1]
                    tweet["geo_reverse"] = geolocate_reverse((lat, lon))
                else:
                    # fresh dict per tweet so records never share a placeholder
                    tweet["geo_reverse"] = dict.fromkeys(geo_reverse_keys, "")
                output_list.append(tweet)

                if total_lines % lines_per_flush == 0:
                    process_output_file(output_list, output_json_filename)
                    output_list = []
                    right_now = datetime.datetime.now()
                    print("%s line %d of file %s" % (right_now.strftime("%c"),
                                                     file_lines,
                                                     input_filename))
                    sys.stdout.flush()

    # Write the tweets read since the last periodic flush; without this the
    # tail of the input (up to lines_per_flush - 1 tweets) would be lost.
    if output_list:
        process_output_file(output_list, output_json_filename)
        output_list = []
def main():
    """Merge the reverse-geocoded tweet files, dropping duplicate tweet ids.

    Every file in ``input_file_list`` is read line by line, each line being
    parsed as one JSON tweet.  Missing files and unparseable lines are
    reported on stdout and skipped.  Only the first tweet seen for a given
    ``"id"`` is kept.  When a kept tweet has a truthy ``coordinates`` field,
    its ``geo_reverse`` field is (re)computed with ``geolocate_reverse``.
    Kept tweets are passed to ``process_output_file`` in batches of 5000.

    Side effects: sets the module globals ``first_sleep`` and
    ``total_written`` and prints progress messages to stdout.

    NOTE(review): ``process_output_file`` is not defined in this function;
    presumably it lives elsewhere in the module and uses the two globals --
    confirm.
    NOTE(review): the code visible here stops inside the read loop, so it
    does not show tweets buffered after the last multiple of 5000 being
    written -- confirm that a final flush happens after the loops.
    """
    import json
    import datetime
    import sys, os, io
    from geolocate_reverse import geolocate_reverse
    
    output_json_filename = "HTA_noduplicates.json"
    input_file_list      = ["HTA_reversegeo.json",  \
                            "HTA_reversegeo2.json", \
                            "HTA_reversegeo3.json", \
                            "HTA_reversegeo4.json"]


    process_start = datetime.datetime.now()
    msg = "\n============================================="  + \
          "\nremove duplicates                            "  + \
          "\nprocess start: %s"%process_start.strftime("%c") + \
          "\n=============================================\n"
    print msg
    sys.stdout.flush()
    
    input_lines           = 0     # how many total input lines?
    output_lines          = 0     # how many output lines?
    duplicates            = 0     # how many duplicates were found?
    coord_count           = 0     # how many coordinate fields did we process?
    id_set                = set() # keep track of unique id's
    output_list           = []    # list of lines to write
    # NOTE(review): these two globals are only assigned here; presumably
    # process_output_file reads and updates them -- confirm.
    global first_sleep
    first_sleep       = True               # first time through, we write to a new file
    global total_written
    total_written     = 0                  # how many rows have we written to the output file
        
    # read in the files one-by-one
    # ============================
    for input_filename in input_file_list:
        print "---Processing file %s"%input_filename
        sys.stdout.flush()
        
        # check that the file exists
        if not os.path.isfile(input_filename):
            msg = "%s does not exist "%input_filename
            print msg
            sys.stdout.flush()
            continue
            
        # open the file and read it line-by-line
        # ======================================
        with open(input_filename, "r" ) as infile:
            file_lines = 0   # line number within the current file, used in messages
            for line in infile:
                input_lines+=1
                file_lines+=1
                    
                # read a line of json
                try:
                    tweet = json.loads(line)
                except Exception, e:
                    # report the bad line and move on to the next one
                    print "\nat line %d of %s "%(file_lines, input_filename)
                    print repr(e)
                    print "line will not be included in the output file\n"
                    sys.stdout.flush()
                    continue
                     
                # have we already seen this tweet's id?
                # duplicates are counted and dropped; the first occurrence wins
                # NOTE(review): a record without an "id" key raises KeyError here
                tweet_id = tweet["id"]
                if tweet_id in id_set:
                    duplicates+=1
                    continue
                id_set.add(tweet_id)

                # does this tweet have a coordinates field?
                # https://dev.twitter.com/docs/platform-objects/tweets
                # reverse geo using it, if it does
                # element 0 is read as longitude and element 1 as latitude;
                # geolocate_reverse is called with a (lat, lon) tuple
                # NOTE(review): a record without a "coordinates" key raises KeyError here
                if tweet['coordinates']:
                    coord_count+=1
                    lon = tweet['coordinates']["coordinates"][0]
                    lat = tweet['coordinates']["coordinates"][1]
                    tweet["geo_reverse"] = geolocate_reverse((lat,lon))
                    # print (lat,lon)
                    # print json.dumps(tweet["geo_reverse"],indent=4)
                    # print
                    
                # add to the output list
                output_list.append(tweet)
                output_lines+=1
                
                # every 5000 kept tweets: hand the batch to process_output_file,
                # start a new empty batch and print a progress line
                if output_lines%5000 == 0:
                    process_output_file(output_list, output_json_filename)
                    output_list = []
                    right_now = datetime.datetime.now()
                    print "%s line %d of file %s"%(right_now.strftime("%c"), file_lines, input_filename)
                    sys.stdout.flush()