Example #1
def method_new(name="Untitled Q-Method", owner="Your Name", email="email", phone='phone', notes=''):
    #look for existing methods:
    options = os.listdir(data_path)
    new_option = ""
    #make sure that:
    #a) we have a new id and
    #b) the new id has not already been used
    while (not new_option) or (new_option in options):
        new_option = generate_id()

    #make new directory in method_path
    method_path = os.path.join(data_path, new_option)
    if not os.path.exists(method_path):
        os.makedirs(method_path)
    else:
        #This should never happen with above while loop, but just in case...
        raise ValueError("Path exists, but it shouldn't: %s" % method_path)
    
    #make an empty configuration file
    config = os.path.join(method_path, "config.json")
    result = load_json(config, create=True)
    result['name'] = name
    result['owner'] = owner
    result['email'] = email
    result['phone'] = phone
    result['notes'] = notes
    result['statements'] = """1. First sample statement
2. Second sample statement"""
    result['columns'] = '2 3 5 6 8 6 5 3 2'

    save_json(config, result)
    
    #redirect to the new method's page:        
    redirect("/method/" + new_option + "/bookmark/")
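
Note: the examples in this listing call load_json and save_json helpers that are not shown. Based on how they are used here (Example #1 calls load_json(config, create=True) on a file that does not exist yet and later hands the dict to save_json), the following is a plausible sketch of that pair. The names and exact behavior are assumptions rather than the original project's code, and some examples below (for instance #9 and #10) use project-specific variants with different signatures.

# Hedged sketch of the load_json/save_json pair assumed by Examples #1, #3,
# #12, #14, #17 and the read_csv examples; not the original implementation.
import json
import os

def load_json(path, create=False):
    """Load a JSON file into a dict; with create=True, return {} if missing."""
    if not os.path.exists(path):
        if create:
            return {}
        raise IOError("No such file: %s" % path)
    with open(path) as f:
        return json.load(f)

def save_json(path, data):
    """Serialize the dict back to the same path."""
    with open(path, 'w') as f:
        json.dump(data, f, indent=2)
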
Example #2
def get_champion_id_dict(SECRET_API_KEY=None):
    dct = helpers.load_json(".cache/champ_ids.json")
    if not dct or time.time() > dct["expiry_time"]:
        if not SECRET_API_KEY:
            SECRET_API_KEY = read_api_key()
        url = BASE + "/lol/static-data/v3/champions?locale=en_US&dataById=true"
        headers = {"X-Riot-Token": SECRET_API_KEY}
        r = requests.get(url, headers=headers)
        if r.status_code == 429:
            print(r.json())
            t = r.json()["Retry-After"]
            print("Waiting " + str(t) + " seconds and trying again")
            print("Full response:")
            print(r.json())
            time.sleep(t)
            return get_champion_id_dict(SECRET_API_KEY)
        if r.status_code != 200:
            print("Get champion id dict failed")
            return r
        dct = {}
        data = r.json()["data"]
        for champ_id in data:
            dct[champ_id] = data[champ_id]["name"]
        dct["expiry_time"] = time.time() + 60 * 60 * 24 * 30  # Expires after one month
        if not helpers.store_json(dct, ".cache/champ_ids.json", True):
            return None
        return dct
    else:
        return dct
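
The Riot API examples (#2, #6, #8, #18, #19) use helpers.load_json and helpers.store_json as a small on-disk cache: load_json returns a falsy value when the cache file is missing or unreadable, and store_json(obj, path, True) appears to create missing parent directories before writing. The sketch below is one way such helpers could look; treat it as an assumption rather than the project's actual module.

# Hedged sketch of a helpers module compatible with the cache calls above.
import json
import os

def load_json(path):
    """Return the cached JSON object, or None if the file is missing or invalid."""
    try:
        with open(path) as f:
            return json.load(f)
    except (IOError, OSError, ValueError):
        return None

def store_json(obj, path, make_dirs=False):
    """Write obj as JSON, optionally creating parent directories; True on success."""
    directory = os.path.dirname(path)
    if make_dirs and directory and not os.path.isdir(directory):
        os.makedirs(directory)
    try:
        with open(path, 'w') as f:
            json.dump(obj, f)
        return True
    except (IOError, OSError, TypeError):
        return False
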
Example #3
def update_json(source, city_tag):
    cache_file = "%s.json" % city_tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)

    local_cache = load_json(cache_destination, create=True)

    assert local_cache.has_key('buildings')
    assert local_cache.has_key('parcels')

    locations = {}
    for key, value in local_cache['buildings'].items():
        location = Location(value)

        for source in location.sources:
            if hasattr(location, source):
                result = getattr(location, source)
                #convert from old dict format here
                if isinstance(result, dict):
                    print "Found dictionary in: %s for: %s" % (
                        source, location.address)

                    result = [result]
                    setattr(location, source, result)

        locations[key] = location

    #back it up for later
    #enable this when downloading GPS coordinates...
    #the rest of the time it slows things down
    local_cache['buildings'] = {}
    for key, value in locations.items():
        local_cache['buildings'][key] = value.to_dict()
    save_json(cache_destination, local_cache)
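
Example #3 (and the longer read_csv examples below) round-trips Location objects through the JSON cache: each cached dict is passed to Location(value) on load and serialized again with to_dict() before save_json. The real class is not shown; a minimal hypothetical stub that would support this round-trip looks roughly like the following.

# Hypothetical Location stub illustrating the cache round-trip used above;
# the original class has geocoding fields and logic that are not shown here.
class Location(object):
    def __init__(self, data=None):
        self.address = ""
        self.address_alt = ""
        self.sources = ["google", "bing"]
        if data:
            # restore whatever attributes were cached previously
            self.__dict__.update(data)

    def to_dict(self):
        """Return a plain dict that save_json can serialize."""
        return dict(self.__dict__)
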
Example #4
	def update_state_json(self, content):
		try:
			cstate = helpers.load_json( content)[ 'cluster']
			self.update_state( cstate)
		except (ValueError, KeyError):
			return False
		return True
Example #6
def get_account_id(summoner_name, SECRET_API_KEY=None):
    dct = helpers.load_json(".cache/summoners/" + summoner_name + ".json")
    if not dct or time.time() > dct["expiry_time"]:
        if not SECRET_API_KEY:
            SECRET_API_KEY = read_api_key()
        if not summoner_name:
            raise TypeError("Summoner name cannot be None")
        if not isinstance(summoner_name, str):
            raise TypeError("Summoner name must be a string")
        url = BASE + "/lol/summoner/v3/summoners/by-name/" + summoner_name
        headers = {"X-Riot-Token": SECRET_API_KEY}
        r = requests.get(url, headers=headers)
        if r.status_code == 429:
            print(r.json())
            t = r.json()["Retry-After"]
            print("Waiting " + str(t) + " seconds and trying again")
            print("Full response:")
            print(r.json())
            time.sleep(t)
            return get_account_id(summoner_name, SECRET_API_KEY)
        if r.status_code != 200:
            print("Get account id failed")
            return r
        response = r.json()
        response["expiry_time"] = time.time() + 60 * 60 * 24 * 30  # Expires after one month
        helpers.store_json(response,
                           ".cache/summoners/" + summoner_name + ".json", True)
        return response["accountId"]
    else:
        return dct["accountId"]
Example #8
def get_match_from_id(matchid, SECRET_API_KEY=None):
    match = helpers.load_json(".match_cache/" + str(matchid) + ".json")
    if not match:
        if not SECRET_API_KEY:
            SECRET_API_KEY = read_api_key()
        if not matchid:
            raise TypeError("Match id cannot be None")
        if not isinstance(matchid, int):
            raise TypeError("Match id must be an int")
        url = BASE + "/lol/match/v3/matches/" + str(matchid)
        headers = {"X-Riot-Token": SECRET_API_KEY}
        r = requests.get(url, headers=headers)
        if r.status_code == 429:
            print("\n" * 5)
            print(r.json())
            print("\n" * 5)
            try:
                t = r.json()["Retry-After"]
            except (KeyError, ValueError):
                t = 180
            print("Waiting " + str(t) + " seconds and trying again")
            print("Full response:")
            print(r.json())
            sys.stdout.flush()
            time.sleep(t)
            return get_match_from_id(matchid, SECRET_API_KEY)
        if r.status_code != 200:
            print("Get match object failed")
            return r
        match = r.json()
        helpers.store_json(match, ".match_cache/" + str(matchid) + ".json",
                           True)
        return match
    else:
        return match
Example #9
def check_resume_success(
    nlp, args, source_file, last_shard, output_path, split, compression
):
    logger.info("Checking if resume was successful...")
    chunk_file_path_str = split + "." + str(last_shard - 1) + ".json"
    if compression:
        chunk_file_path_str += ".gz"
    chunk_file_path = os.path.join(output_path, chunk_file_path_str)

    line_source = source_file.readline().strip()

    line_source_tokenized = next(tokenize(nlp, [line_source]))

    # Apply preprocessing on the line
    preprocessed_line = preprocess(
        line_source_tokenized,
        [1] * len(line_source_tokenized),
        args.min_sentence_ntokens,
        args.max_sentence_ntokens,
        args.min_example_nsents,
        args.max_example_nsents,
    )[0]

    try:
        chunk_json, _ = load_json(chunk_file_path)
    except FileNotFoundError:
        logger.error(
            "The file at path %s was not found. Make sure `--compression` is set correctly.",
            chunk_file_path,
        )
        return False
    last_item_chunk = chunk_json[-1]
    line_chunk = last_item_chunk["src"]

    # remove the last item if it is a newline
    if line_chunk[-1] == ["\n"]:
        line_chunk.pop()

    if line_chunk == preprocessed_line:
        logger.info("Resume Successful!")
        logger.debug("`source_file` moved forward one line")
    else:
        logger.info("Resume NOT Successful")
        logger.info("Last Chunk Line: %s", line_chunk)
        logger.info("Previous (to resume line) Source Line: %s", preprocessed_line)
        # skipcq: PYL-W1201
        logger.info(
            (
                "Common causes of this issue:\n"
                + "1. You changed the `--shard_interval`. You used a different interval previously than you used in the command to resume.\n"
                + "2. The abstractive (`.source` and `.target`) or extractive (`.json`) dataset files were modified or removed. The last `.json` file needs to be in the same folder it was originally outputted to so the last shard index can be determined and the last line can be read.\n"
                + "3. It is entirely possible that there is a bug in this script. If you have checked that the above were not the cause and that there were no issues pertaining to your dataset then open an issue at https://github.com/HHousen/TransformerSum/issues/new."
            )
        )
        return False

    return True
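
Unlike the single-value helpers above, the load_json in this example is unpacked as `chunk_json, _ = load_json(chunk_file_path)` and has to cope with optionally gzip-compressed shards (.json.gz). A sketch of such a variant, offered only as an assumption about its shape:

# Assumed two-value, gzip-aware load_json matching the unpacking above;
# not necessarily the project's actual helper.
import gzip
import json

def load_json(json_file):
    """Return (data, path), transparently reading .gz-compressed JSON shards."""
    if json_file.endswith(".gz"):
        with gzip.open(json_file, "rt") as f:
            return json.load(f), json_file
    with open(json_file) as f:
        return json.load(f), json_file
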
Example #10
def test_migrate_pids(app, location, datadir):
    """Test migrate pids."""
    data = load_json(datadir, 'cds_records_demo_1_project.json')
    dump = CDSRecordDump(data=data[0])
    record = CDSRecordDumpLoader.create(dump=dump)

    pids = [pid.pid_value for pid in
            PersistentIdentifier.query.filter_by(object_uuid=record.id)]
    expected = sorted(['2093596', 'CERN-MOVIE-2012-193'])
    assert sorted(pids) == expected
Example #11
def _load_from_file( filename):
	try:
		with open( filename, 'r') as f:
			content = helpers.load_json( f.read())
	except (IOError, ValueError, KeyError) as e:
		_logger.warn( 'Cannot read file %s with json configuration %s', filename, e)
		return False

	r = _load_from_content( content)
	if r:
		_logger.info( 'Loading %s', filename)
	return r
Example #12
def configure():
    #look for existing methods:
    options = os.listdir(data_path)
    results = {}
    for option in options:
        method_path = os.path.join(data_path, option)
        config = os.path.join(method_path, "config.json")
        if os.path.exists(config):
            result = load_json(config)
            results[option] = result
        
    return template('configure', options=results)
Example #14
def subject_new(key):
    """
    create a new subject for the Q-Method specified by key
    """
    method_path = os.path.join(data_path, key)
    if not os.path.exists(method_path):
        return template('404', key=key, item="method")
    else:
        #look for existing subjects:
        options = os.listdir(method_path)
        new_option = ""
        #make sure that:
        #a) we have a new id and
        #b) the new id has not already been used
        while (not new_option) or (new_option in options):
            new_option = generate_id()

        #make new directory in method_path
        subject_path = os.path.join(method_path, new_option)
        if not os.path.exists(subject_path):
            os.makedirs(subject_path)
        else:
            #This should not ever happen with above check, but just in case...
            raise ValueError("Subject path exists, but it shouldn't: %s" % subject_path)

        #make an empty configuration file
        config = os.path.join(subject_path, "subject_config.json")
        result = load_json(config, create=True)
        #once the subject starts sorting, we will cache this locally
        #based on the current state of the method configuration
        #result['statements'] = ""
        result['columns'] = u""
        result['json'] = u""
        result['started'] = u""
        #a textual representation of where each statement is
        result['state'] = u""
        result['history'] = u""
        #is it finished? complete? this will prevent further changes:
        result['locked'] = False
        #now:
        now = datetime.now()
        result['created'] = now.strftime("%Y.%m.%d %H:%M:%S")

        # after first movement
        result['started'] = u""
        result['last_update'] = u""

        save_json(config, result)

        #redirect to the new method's page:        
        redirect("/method/" + key + "/")
Example #15
def main():

    data = helpers.load_json("data/states.json")

    if not isinstance(data, dict):
        data = {x["full_name"]: x for x in data}

    key = "marriage_age"

    new = {}
    lines = helpers.read_lines("entry.txt")
    lines = [x for x in lines if x]
#    lines = lines[::4]

    for line in lines:
        line = line.split(". ")[-1]
        name, num = line.split(": ", 1)
        new[name] = float(num)

        try:
            name = line.split("\t")[0]
            name = name.split("(")[0].strip()
            new[name] = float(line.split("\t")[1].replace(",", ""))
        except Exception:
            pass

    for k, v in new.items():
        print(k, ":", v)

    for name, val in new.items():
        if name not in data:
            data[name] = {}
        data[name][key] = val

    # Clean up the data
    cleaned = {}
    for k, v in data.items():
        key = rmchars(k, ".")
        key = key.replace("Saint", "St")
        if key in cleaned:
            cleaned[key].update(v)
        else:
            cleaned[key] = v
        cleaned[key]["name"] = key

    return helpers.dump_json(cleaned, "foo.json")
Example #16
def bulk_generation():
    filenames_test = load_json('filenames_test_saifullah')
    path = '../../../../Desktop/UAV/images/'
    num_test_images = len(filenames_test)
#    generated_captions = list()
    f = open('generated_captions_saifullah.txt', 'w')
    for i in range(num_test_images):
        if i == 884:
            # image 884 (square_40) is corrupted, so reuse the previous one
            C = generate_caption(path + filenames_test[883])
            f.writelines(C + '\n')
            continue
        C = generate_caption(path + filenames_test[i])
#        generated_captions.append(C)
        f.writelines(C + '\n')
        progress = 100 * i / num_test_images
        print(i, "Progress: %.2f" % progress)
    f.close()
Example #17
def post_method_json(key=None):
    #print dir(request.forms)
    #print request.forms.keys()
    
    method_path = os.path.join(data_path, key)
    if not os.path.exists(method_path):
        return template('404', key=key, item="method")
    else:
        config = os.path.join(method_path, "config.json")
        result = load_json(config)

        changed = False
        for key in request.forms.keys():
            #special case for 'statements' key...
            #want to get rid of any extra newline characters
            #this will help calculate the number of statements more accurately
            #(rather than stripping newlines everywhere we look at statements)
            #
            #this works here, but it will make it difficult to provide
            #feedback to the user about how many statements there are
            #compared to how many spaces there are available in columns
            #adding a similar check in method.js
            if key == "statements":
                text = request.forms.get(key)
                lines = text.splitlines()
                new_lines = []
                for line in lines:
                    if line:
                        new_lines.append(line)
                value = '\n'.join(new_lines)
            else:
                value = request.forms.get(key)
            
            if value != result[key]:
                #print "%s (original) != %s (new)" % (result[key], request.forms.get(key))

                result[key] = value
                changed = True

        if changed:
            #print "METHOD CONFIG CHANGED!!!! (saving)"
            save_json(config, result)
            
        return template('success')
Example #18
def get_matches_for_tcode(tcode):
	path = ".cache/tournament_matches/" + tcode + ".json"
	matches = None
	try:
		matches = helpers.load_json(path)
	except Exception:
		print("Failed to load path")
		print(path)
	miss = True
	if matches is not None:
		file_creation = os.path.getmtime(path)
		expiry_time = 3600 * 24 * 2
		miss = (matches == [] and time.time() > file_creation + expiry_time)
	if miss:
		print("Waiting due to cache miss")
		sys.stdout.flush()
		time.sleep(2)
		
		endpoint = "/lol/match/v3/matches/by-tournament-code/"+str(tcode)+"/ids"
		api_key = fetch_api_key()
		headers = {"X-Riot-Token": api_key}
		url = M_BASE + endpoint
		r = requests.get(url, headers=headers)
		if r.status_code == 404:
			print("Tcode " + str(tcode) + " does not have any games associated with it.")
			matches = []
			helpers.store_json(matches, path, True)
			return matches
		elif r.status_code == 429:
			print("Hit a retry-after when getting tournament matches. Exiting")
			exit(0)
		elif r.status_code != 200:
			print("Failed to get matches for tcode " + str(tcode))
			print(r)
			return r
		matches = []
		for match_id in r.json():
			match = get_tournament_match(match_id, tcode)
			if match:
				matches.append(match)
		helpers.store_json(matches, path, True)
	return matches
Example #19
def get_recent_history(accountid, SECRET_API_KEY=None):
    hist = helpers.load_json(".cache/recent_histories/" + str(accountid) +
                             ".json")
    if not hist or time.time() > hist["expiry_time"]:
        if not SECRET_API_KEY:
            SECRET_API_KEY = read_api_key()
        if not accountid:
            raise TypeError("Account id cannot be None")
        if not isinstance(accountid, int):
            raise TypeError("Account id must be an int")
        url = BASE + "/lol/match/v3/matchlists/by-account/" + str(
            accountid) + "/recent"
        headers = {"X-Riot-Token": SECRET_API_KEY}
        r = requests.get(url, headers=headers)
        if r.status_code == 429:
            print(r.json())
            t = r.json()["Retry-After"]
            print("Waiting " + str(t) + " seconds and trying again")
            print("Full response:")
            print(r.json())
            time.sleep(t)
            return get_recent_history(accountid, SECRET_API_KEY)
        if r.status_code != 200:
            print("Get match history failed")
            return r
        response = r.json()
        if "matches" not in response:
            print("There is no recent match history")
            return []
        hist = {
            "expiry_time": time.time() + 60 * 60 * 2,
            "matches": response["matches"]
        }
        helpers.store_json(hist, ".cache/recent_histories/" + str(accountid) +
                           ".json", True)  # Expires after 2 hours
        return hist["matches"]
    else:
        return hist["matches"]
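
Examples #2, #6, #8 and #19 all repeat the same rate-limit handling: on HTTP 429 they print the body, sleep for the advertised delay, and call themselves again. One way to factor that out is a small wrapper like the sketch below; riot_get is a hypothetical helper, and it reads the standard Retry-After response header rather than the JSON body the examples inspect.

# Illustrative consolidation of the 429/Retry-After retry pattern used above.
import time
import requests

def riot_get(url, api_key, default_wait=180):
    """GET a Riot API URL, sleeping and retrying whenever we get rate limited."""
    headers = {"X-Riot-Token": api_key}
    while True:
        r = requests.get(url, headers=headers)
        if r.status_code != 429:
            return r
        try:
            wait = int(r.headers.get("Retry-After", default_wait))
        except (TypeError, ValueError):
            wait = default_wait
        print("Waiting " + str(wait) + " seconds and trying again")
        time.sleep(wait)
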
Example #20
def test_subformat_creation_if_missing(api_app, location, datadir, es, users):
    """Test subformat creation if missing."""
    # [[ migrate the video ]]
    migration_streams = get_migration_streams(datadir=datadir)
    data = load_json(datadir, 'cds_records_demo_1_video.json')
    dump = CDSRecordDump(data=data[0])
    with mock.patch.object(DataCiteProvider, 'register'), \
            mock.patch.object(CDSRecordDumpLoader, '_create_frame',
                              side_effect=get_frames), \
            mock.patch.object(ExtractFramesTask, '_create_gif'), \
            mock.patch.object(CDSRecordDumpLoader, '_clean_file_list'), \
            mock.patch.object(
                CDSRecordDumpLoader, '_get_migration_file_stream_and_size',
                side_effect=migration_streams):
        video = CDSRecordDumpLoader.create(dump=dump)
    db.session.commit()

    with mock.patch.object(TranscodeVideoTask, 'run') as mock_transcode:
        deposit = deposit_video_resolver(video['_deposit']['id'])
        deposit_id = deposit.id
        # simulate the missing of a subformat
        del deposit['_files'][0]['subformat'][0]
        assert len(deposit['_files'][0]['subformat']) == 4
        #  recreate 240p format
        CDSRecordDumpLoader._create_missing_subformats(
            record=video, deposit=deposit)
        db.session.commit()
        # check subformats
        deposit = Video.get_record(deposit_id)
        rec_video = record_resolver.resolve(video['recid'])[1]
        #  rec_video = record_resolver.resolve(video['recid'])[1]
        assert len(deposit['_files'][0]['subformat']) == 5
        assert len(rec_video['_files'][0]['subformat']) == 5
        # check if transcoding is called properly
        assert mock_transcode.called is True
        [(_, call_args)] = mock_transcode.call_args_list
        assert call_args == {'preset_quality': '240p'}
Example #21
def read_csv(source_csv):
    city_options = City.objects.filter(tag="bloomington_in")
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = "Bloomington"
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-08-29"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key("buildings"):
        local_cache["buildings"] = {}
    if not local_cache.has_key("parcels"):
        local_cache["parcels"] = {}

    locations = {}
    for key, value in local_cache["buildings"].items():
        locations[key] = Location(value)

    # geocoder helper:
    geo = Geo()

    skips = 0
    with codecs.open(source_csv, "rb", encoding="utf-8") as csvfile:
        # reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        reader = csv.reader(csvfile)

        # just print the first row:
        print ">, <".join(reader.next())

        count = 0
        for row in reader:
            count += 1
            print "Looking at row: %s" % count

            # could exit out early here, if needed
            if count > 1000:
                # exit()
                pass

            bldg_id = row[0]
            print bldg_id

            address = row[1]
            print address

            owner = row[2]

            # skip this:
            owner_contact = row[3]

            agent = row[4]

            bldg_units = row[9]
            print bldg_units

            units_bdrms = row[10]
            print units_bdrms

            # check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

            # make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
            else:
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

                # temporarily just want to look at google again
                location.sources = ["google"]

                # do some geocoding, as needed:
                search = "%s, Bloomington IN" % address.upper()

                any_updated = False
                for geo_source in location.sources:
                    update = geo.lookup(search, geo_source, location, force=True)
                    if update:
                        any_updated = True

                location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True

                location.address_alt = search
                location.bldg_units = bldg_units
                location.units_bdrms = units_bdrms
                locations[address.upper()] = location

                # handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source)

                # owner_details = parse_person(owner)
                if owner:
                    result = special_cases(owner)
                    if result:
                        (owner_name, owner_address) = result
                    else:
                        (owner_name, owner_address, owner_phone, remainder) = parse_person(owner)
                        ## print "owner name: %s" % owner_name
                        ## print "owner address: %s" % owner_address
                        ## print ""

                        if owner_name:
                            (person, bldg_person) = make_person(owner_name, bldg, "Owner", address=owner_address)

                if agent and agent != "No Agent":
                    # agent_details = parse_person(agent)
                    (agent_name, agent_address, agent_phone, remainder) = parse_person(agent)
                    ## print "agent name: %s" % agent_name
                    ## print "agent address: %s" % agent_address
                    ## print ""

                    if agent_name:
                        (person, bldg_person) = make_person(agent_name, bldg, "Agent", address=agent_address, city=city)

                if any_updated:
                    # back it up for later
                    # enable this when downloading GPS coordinates...
                    # the rest of the time it slows things down
                    local_cache["buildings"] = {}
                    for key, value in locations.items():
                        local_cache["buildings"][key] = value.to_dict()
                    save_json(cache_destination, local_cache)

                print

    save_results(locations, "bloomington-filtered.tsv")
Example #22
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

decoder_model.load_weights(
    'best_models/InceptionV3_5layers/1_checkpoint.keras')

image_dir = '../../Desktop/parsingDataset/RSICD_images/'

inception_tv_train = np.load(
    'image_features/transfer_values/InceptionV3/transfer_values_train.npy')
inception_tv_test = np.load(
    'image_features/transfer_values/InceptionV3/transfer_values_test.npy')

captions_train = load_json('captions_train')


def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


chencherry = SmoothingFunction()


def bleu(reference, candidate, grade=1):
    reference_tokenized = word_tokenize(reference)
    reference_list = list()
    reference_list.append(reference_tokenized)
Example #23
def read_csv(source):
    #for reading unicode
    #f = codecs.open(source, 'r', encoding='utf-8')

    city_options = City.objects.filter(tag="ann_arbor")
    print len(city_options)
    if not len(city_options):
        city = City()
        city.name = "Ann Arbor"
        city.tag = to_tag(city.name)
        city.save()
    else:
        city = city_options[0]

    print city

    #TODO:
    #setup FeedInfo item
    #and also create a Source item

    permit_sub_types = []
    status_types = []
    building_nums = []
    applicants = []
    managers = []

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    #with open('eggs.csv', 'rb') as csvfile:
    with codecs.open(source, 'rb', encoding='utf-8') as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        reader = csv.reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0
        for row in reader:
            count += 1
            #could exit out early here, if needed
            if count > 10:
                pass

            print row

            #type of building (eg: sf attached, duplex, etc)
            permit_id = row[0]

            #should always be "RENTAL" (don't need to track this one)
            permit_type = row[1]
            if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
                raise ValueError, "Unexpected permit type: %s in row: %s" % (
                    permit_type, row)

            sub_type = row[2]

            #can use this to filter out non-rental or obsolete entries
            #don't need to track otherwise:
            status = row[3]
            parcel_id = row[4]
            address = row[5]

            #should be fixed per source:
            city = row[6]
            if not ((city.lower() == 'ann arbor') or (city == '')):
                raise ValueError, "Unexpected city: %s" % (city)

            sqft = row[7]
            number_of_buildings = row[8]
            applicant_name = row[9]
            number_of_stories = row[10]
            number_of_units = row[11]

            if (not status in ['EXPIRED', 'CLOSED']) and (permit_type
                                                          in ['RENTAL']):
                #check if we've started processing any results for this row
                #if local_cache['buildings'].has_key(address.upper()):
                #    local_cache_cur = local_cache['buildings'][address.upper()]
                #else:
                #    local_cache_cur = {}

                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

                #do some geocoding, as needed:
                search = "%s, Ann Arbor MI" % address.upper()

                for source in location.sources:
                    geo.lookup(search, source, location)

                location.address_alt = search

                locations[address.upper()] = location

                #local_cache['buildings'][address.upper()] = local_cache_cur

                #and check if a previous building object in the db exists
                #CREATE A NEW BUILDING OBJECT HERE
                #cur_building = Building()
                bldg = Building()
                bldg.type = sub_type

            #back it up for later
            local_cache['buildings'] = {}
            for key, value in locations.items():
                local_cache['buildings'][key] = value.to_dict()

            save_json(cache_destination, local_cache)
            #exit()

            #THE FOLLOWING ARE FOR INFORMATIONAL PURPOSES ONLY
            #(to see what data is available)

            if not status in status_types:
                #print "adding: %s" % sub_type
                status_types.append(status)

            if not sub_type in permit_sub_types:
                #print "adding: %s" % sub_type
                permit_sub_types.append(sub_type)

            building_num = row[8]
            if not building_num in building_nums:
                #print "adding: %s" % sub_type
                building_nums.append(building_num)

            applicant = row[9]
            if (re.search('MGMT', applicant) or re.search('REALTY', applicant)
                    or re.search('PROPERTIES', applicant)
                    or re.search('MANAGEMENT', applicant)
                    or re.search('GROUP', applicant)
                    or re.search('LLC', applicant)
                    or re.search('L.L.C.', applicant)
                    or re.search('INC', applicant)):
                if not applicant in managers:
                    managers.append(applicant)
            else:
                if not applicant in applicants:
                    applicants.append(applicant)

            #print ', '.join(row)
            #print

    ## print permit_sub_types
    print status_types
    print building_nums

    save_results(locations)
Example #25
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-07-31"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name


    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}
    
    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    #with codecs.open(source_csv, 'rb', encoding='utf-8') as csvfile:
    with open(source_csv) as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        #reader = csv.reader(csvfile)
        #reader = unicodecsv.UnicodeReader(csvfile, encoding='utf-8')

        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0
        for row in reader:
            count += 1
            print "Looking at row: %s" % count
            
            #could exit out early here, if needed
            if count > 1000:
                #exit()
                pass
            
            address = row[0]

            #need to fix the number being at the end of the address
            parts = address.split(',')
            anumber = parts[-1]
            parts = parts[:-1]
            street = ",".join(parts)
            address = "%s %s" % (anumber, street)


            invoice_number = row[1]
            bldg_id = row[1]
            print bldg_id

            #this is where owner is stored
            invoice_note = row[6]
            print invoice_note
            if re.match('Sent to:', invoice_note):
                print "changing invoice note from: %s" % invoice_note
                invoice_note = invoice_note[8:]
                print "to: %s" % invoice_note
            else:
                #raise ValueError, "invoice note does not start with Sent to"
                print "!!!!!invoice note does not start with Sent to!!!!!"
                print ""
                print ""

            no_units = row[12]
            
            ## #should always be "RENTAL" (don't need to track this one)
            ## permit_type = row[1]
            ## if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
            ##     raise ValueError, "Unexpected permit type: %s in row: %s" % (
            ##         permit_type, row)
            
            ## bldg_type = row[2]
            
            ## #can use this to filter out non-rental or obsolete entries
            ## #don't need to track otherwise:
            ## status = row[3]
            ## parcel_id = row[4]

            ## #should be fixed per source:
            ## ss_city = row[6]

            ## bldg_sf = row[7]
            ## no_bldgs = row[8]
            ## applicant_name = row[9]
            ## no_stories = row[10]
            ## no_units = row[11]

            ## if not ( (ss_city.lower() == city_name.lower()) or (ss_city == '') ):
            ##     raise ValueError, "Unexpected city: %s" % (ss_city)

            ## sqft = row[7]
            ## number_of_buildings = row[8]
            ## applicant_name = row[9]
            ## number_of_stories = row[10]
            ## number_of_units = row[11]
            
            #check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

            ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

            #make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
            else:
                #check if we've started processing any results for this row
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

            #temporarily just want to look at google again
            #location.sources = ["google"]
            #location.sources = ["google", "bing", "usgeo", "geonames", "openmq"]
            location.sources = ["google", "bing"]

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (address.upper(), city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                update = geo.lookup(search, geo_source, location, force=True)
                #update = geo.lookup(search, geo_source, location, force=False)
                if update:
                    any_updated = True

            location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not hasattr(location, "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            #location.bldg_units = bldg_units
            #location.units_bdrms = units_bdrms
            locations[address.upper()] = location

            #handle the database storage
            bldg = make_building(location, bldg_id, city, feed_source, no_units=no_units)

            if invoice_note:
                (person, bldg_person) = make_person(invoice_note, bldg, "Permit Applicant")

            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
Example #27
    def input_data(self):
        if not self._input_data:
            self._input_data = load_json(self._inputfile)

        return self._input_data
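
Example #27 is a lazy accessor: the JSON input file is parsed only on first access and then cached on the instance (in the original class it is presumably wrapped in @property, which is not shown). A minimal self-contained sketch of the same pattern, with hypothetical class and helper names:

# Minimal sketch of the lazy-loading accessor in Example #27; the class name
# and @property decorator are assumptions, not taken from the original code.
import json

class JsonInput(object):
    def __init__(self, inputfile):
        self._inputfile = inputfile
        self._input_data = None

    @property
    def input_data(self):
        # parse the file once, then reuse the cached object
        if not self._input_data:
            with open(self._inputfile) as f:
                self._input_data = json.load(f)
        return self._input_data
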
Example #28
def create_table_command(json_file='table_columns.json',
                         table_name='congress_bills'):
    col_data = load_json(json_file)
    ct = CreateTable(table_name, col_data)
    return ct.parse_to_create_table_command()
Example #29
def read_csv(source_csv, city_tag, feed_date):
    #could also use city.models.find_by_city_state
    city_options = City.objects.filter(tag=city_tag)
    #print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
    else:
        city = city_options[0]

    print city


    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name


    # ideally, we could use the database itself as the cache
    # instead of a local file,
    # but it's also good to avoid repeating geo queries when processing in bulk.
    # the site code *will* make geo queries,
    # so it's still a good idea to cache the geocoded address locally,
    # even if the site code handles everything else.
    
    cache_file = "%s.json" % city.tag
    #print cache_file
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    print cache_destination
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    loaded_cache = load_json(cache_destination, create=True)

    #need to go through and load SearchResults separately
    local_cache = {}
    for key in loaded_cache.keys():
        #this is useful if there is a cached value 
        #that was not parsed correctly... this will remove it:
        #if key.strip() == "314 North Washington Street Apt. C":
        if key.strip() == "some address with bad cached data":
            print "not adding: ", key
            #exit()
            pass
        else:
            current = loaded_cache[key]
            results = current['results']
            #print results
            sr = SearchResults()
            #sr.from_dict(results, debug=True)
            sr.from_dict(results, debug=False)
            #print sr
            current['results'] = sr

            #print current['results']
            local_cache[key] = current
        
    #use street address as the key
    #for each address, store SearchResults object

    #reset skips for every run:
    skips = codecs.open("skips.txt", 'w', encoding='utf-8')
    skips.close()


    skips = 0
    #with codecs.open(source_csv, 'rb', encoding='utf-8') as csvfile:
    with open(source_csv) as csvfile:

        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        #reader = csv.reader(csvfile)
        #reader = unicodecsv.UnicodeReader(csvfile, encoding='utf-8')
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())
        print

        keys = []
        for item in reader.next():
            key = item.lower().strip()
            key = key.replace('(', '')
            key = key.replace(')', '')
            key = key.replace('-', '_')
            key = key.replace('.', '')
            key = key.replace('/ ', '')
            key = key.replace('/', '_')
            key = key.replace('"', '')
            key = key.replace('#', 'num')
            key = key.replace(' ', '_')
            keys.append(key)
        
        #*and* the second row in this case
        print '>, <'.join(keys)

        #currently:
        #<street_address>, <unit_if_applicable>, <unit_type>, <rent>, <security_deposit>, <sq_feet_per_unit>, <num_bedrooms>, <num_bathrooms>, <maximum_occupancy_per_unit>, <lease_period>, <availability>, <laundry>, <parking>, <air_conditioning>, <pets>, <gym_fitness_center>, <game_room_rec_center_community_center>, <pool>, <other_amenities>, <bike_friendly>, <recycling>, <composting>, <gardening>, <public_transit>, <walk_friendly>, <other_smartliving_features>, <who_pays_for_electricity>, <who_pays_for_natural_gas>, <who_pays_for_water>, <who_pays_for_trash_recycling_pickup>, <who_pays_for_telephone_land_line>, <who_pays_for_cable>, <who_pays_for_internet>, <electricity_provider>, <electric_utility_cost_average_per_mo>, <electric_utility_cost_low>, <electric_utility_cost_high>, <natural_gas_provider>, <natural_gas_utility_cost_average_per_mo>, <natural_gas_utility_cost_low>, <natural_gas_utility_cost_high>, <energy_saving_features>, <utility_info_source>, <agent_property_manager>, <property_website_url>, <agent_property_manager_address>, <agent_property_manager_phone>, <owner>, <comments>

        #exit()

        count = 0
        #start = 6439
        start = 0

        #if you want to randomize the order... to distribute options more evenly
        #just do this in the original spreadsheet.
        #in order to randomize, should randomize the order in the csv
        for row in reader:

            current = {}
            count += 1
            print "Looking at row: %s" % count
            
            #could exit out early here, if needed (for testing)
            if count > 7220:
                #all_done(cache_destination, local_cache)
                pass

            if count >= start:

                address = process_row(current, row, keys, local_cache, city, feed_source, count)
            
                print

                local_cache[address] = current
                #save every time...
                #never know when a crash will happen:
                #however, this does make things run considerably slower
                #especially once the cached file size grows.
                #save_results(cache_destination, local_cache)

                #exit()
            
    all_done(cache_destination, local_cache)
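
The header-cleanup loop near the top of this function (the chain of key.replace calls) is repeated verbatim across these CSV importers. A small helper performing the identical substitutions, in the same order, could keep it in one place:

def normalize_header(item):
    #same substitutions as the inline loop above, applied in order
    key = item.lower().strip()
    for old, new in [('(', ''), (')', ''), ('-', '_'), ('.', ''),
                     ('/ ', ''), ('/', '_'), ('"', ''), ('#', 'num'),
                     (' ', '_')]:
        key = key.replace(old, new)
    return key

#keys = [normalize_header(item) for item in reader.next()]
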
Example #30
0
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "rentrocket.settings")

## from rentrocket import settings
## from django.core.management import setup_environ
## setup_environ(settings)

from city.models import City, to_tag

from helpers import save_json, load_json, Location, Geo, save_results, make_building

cache_file = "cities.json"
cache_destination = os.path.join(os.path.dirname(__file__), cache_file)
#keep a local copy of data we've processed...
#this should help with subsequent calls
#to make sure we don't need to duplicate calls to remote geolocation APIs:
saved_cities = load_json(cache_destination, create=True)

#geocoder helper:
geo = Geo()

cities = [
    ['Bloomington', 'IN', '', ''],
    ['Ann Arbor', 'MI', '', ''],
    ['Albany', 'NY', '', ''],
    ['Iowa City', 'IA', '', ''],
    ['Burlington', 'VT', '', ''],
    ['Austin', 'TX', '', ''],
    ['Columbia', 'MO', '', ''],
    ['Madison', 'WI', '', ''],
    ['Lawrence', 'KS', '', ''],
    ['Berkeley', 'CA', '', ''],
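
The excerpt cuts off partway through the cities list, and the rest of this make_cities-style script is not shown. Presumably it walks the list, looks up or creates each city, and writes results back into the saved_cities cache; a sketch under that assumption (the geocoding and City-creation step itself is omitted, and the cached dict layout is a guess):

for name, state, _, _ in cities:
    tag = to_tag(name)
    if tag not in saved_cities:
        #look up or create the City here, then remember it
        saved_cities[tag] = {'name': name, 'state': state}
        save_json(cache_destination, saved_cities)
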
Example #31
0
 def load(self, str_data):
     try:
         self.data = helpers.load_json(str_data)
     except ValueError:
         return False
     return True
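
A usage sketch for the load() pattern above: malformed JSON becomes a simple boolean instead of an exception. The wrapper class here is hypothetical, and json.loads stands in for the project's helpers.load_json (which in this example parses a string rather than a file path).

import json

class JsonBacked(object):
    #hypothetical minimal owner of the load() method shown above
    def __init__(self):
        self.data = None

    def load(self, str_data):
        try:
            self.data = json.loads(str_data)   #stand-in for helpers.load_json
        except ValueError:
            return False
        return True

parser = JsonBacked()
print parser.load('{"ok": 1}')        #True
print parser.load('{"truncated": ')   #False
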
Example #32
0
from helpers import load_json
from helpers import print_progress
from copy import copy
import json
import numpy as np

from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

#from tensorflow.python.keras.models import Model

transfer_values_train = np.load(
    'image_features/transfer_values/InceptionV3/transfer_values_train.npy')
transfer_values_test = np.load(
    'image_features/transfer_values/InceptionV3/transfer_values_test.npy')
captions_train = load_json('captions_train')

# LOAD THE CANDIDATE CAPTIONS
BS_filename = 'InceptionCaptions/9_beamsearched.json'
with open(BS_filename, 'r') as f:
    candidate_captions = json.load(f)
# Load the transfer model
# After first execution, you can comment these lines
#from tensorflow.python.keras.applications import VGG16
#image_model = VGG16(include_top=True, weights='imagenet')
#transfer_layer=image_model.get_layer('fc2')
#image_model_transfer = Model(inputs=image_model.input,
#                             outputs=transfer_layer.output)
image_dir = 'UAV/images/'
filenames_test = load_json('filenames_test')
Example #33
0
	def load( self, str_data):
		try:
			self.data = helpers.load_json( str_data)
		except ValueError:
			return False
		return True
Example #34
0
## from rentrocket import settings
## from django.core.management import setup_environ
## setup_environ(settings)

from city.models import City, to_tag

from helpers import save_json, load_json, Location, Geo, save_results, make_building


cache_file = "cities.json" 
cache_destination = os.path.join(os.path.dirname(__file__), cache_file)
#keep a local copy of data we've processed...
#this should help with subsequent calls
#to make sure we don't need to duplicate calls to remote geolocation APIs:
saved_cities = load_json(cache_destination, create=True)


#geocoder helper:
geo = Geo()

cities = [ ['Bloomington', 'IN', '', ''],
           ['Ann Arbor', 'MI', '', ''],
           ['Albany', 'NY', '', ''],
           ['Iowa City', 'IA', '', ''],
           ['Burlington', 'VT', '', ''],
           ['Austin', 'TX', '', ''],
           ['Columbia', 'MO', '', ''],
           ['Madison', 'WI', '', ''],
           ['Lawrence', 'KS', '', ''],
           ['Berkeley', 'CA', '', ''],
Example #35
0
def test_migrate_record(frames_required, api_app, location, datadir, es,
                        users):
    """Test migrate date."""
    # [[ migrate the project ]]
    data = load_json(datadir, 'cds_records_demo_1_project.json')
    dump = CDSRecordDump(data=data[0])
    project = CDSRecordDumpLoader.create(dump=dump)
    p_id = project.id

    assert project['$schema'] == Project.get_record_schema()
    assert project['publication_date'] == '2016-01-05'
    assert 'license' not in project
    assert 'copyright' not in project
    assert project['_cds'] == {
        "state": {
            "file_transcode": "SUCCESS",
            "file_video_extract_frames": "SUCCESS",
            "file_video_metadata_extraction": "SUCCESS"
        },
        'modified_by': users[0],
    }

    # check project deposit
    deposit_project_uuid = PersistentIdentifier.query.filter_by(
        pid_type='depid', object_type='rec').one().object_uuid
    deposit_project = Record.get_record(deposit_project_uuid)
    assert Project._schema in deposit_project['$schema']
    assert project.revision_id == deposit_project[
        '_deposit']['pid']['revision_id']
    assert deposit_project['_deposit']['created_by'] == 1
    assert deposit_project['_deposit']['owners'] == [1]
    assert deposit_project['_files'] == []

    # [[ migrate the video ]]
    data = load_json(datadir, 'cds_records_demo_1_video.json')
    dump = CDSRecordDump(data=data[0])
    db.session.commit()

    def check_symlinks(video):
        symlinks_creator = SymlinksCreator()
        files = list(symlinks_creator._get_list_files(record=video))
        assert len(files) == 1
        for file_ in files:
            path = symlinks_creator._build_link_path(
                symlinks_creator._symlinks_location, video, file_['key'])
            assert os.path.lexists(path)

    def check_gif(video, mock_gif):
        # called only once for deposit
        (_, _, mock_args) = mock_gif.mock_calls[0]
        # check gif record
        video = CDSRecord(dict(video), video.model)
        # check gif deposit
        deposit = deposit_video_resolver(video['_deposit']['id'])
        master_video = CDSVideosFilesIterator.get_master_video_file(deposit)
        assert mock_args['master_id'] == master_video['version_id']
        assert str(deposit.files.bucket.id) == mock_args['bucket']
        #  assert mock_args['bucket'].id == deposit.files.bucket.id
        assert len(mock_args['frames']) == 10
        assert 'output_dir' in mock_args

    migration_streams = get_migration_streams(datadir=datadir)
    with mock.patch.object(DataCiteProvider, 'register'), \
            mock.patch.object(CDSRecordDumpLoader, '_create_frame',
                              side_effect=get_frames), \
            mock.patch.object(CDSRecordDumpLoader, '_get_minimum_frames',
                              return_value=frames_required) as mock_frames, \
            mock.patch.object(
                ExtractFramesTask, '_create_gif') as mock_gif, \
            mock.patch.object(
                CDSRecordDumpLoader, '_get_migration_file_stream_and_size',
                side_effect=migration_streams), \
            mock.patch.object(CDSRecordDumpLoader, '_clean_file_list'):
        video = CDSRecordDumpLoader.create(dump=dump)
        assert mock_frames.called is True
    db.session.add(video.model)
    video_id = video.id
    # check smil file
    smil_obj = ObjectVersion.query.filter_by(
        key='CERN-MOVIE-2012-193-001.smil', is_head=True).one()
    storage = smil_obj.file.storage()
    assert '<video src' in storage.open().read().decode('utf-8')
    # check video symlinks
    check_symlinks(video)
    # check gif
    check_gif(video, mock_gif)
    # check project
    project = Record.get_record(p_id)
    assert project['videos'] == [
        {'$ref': 'https://cds.cern.ch/api/record/1495143'}
    ]
    assert video['$schema'] == Video.get_record_schema()
    assert video['date'] == '2012-11-21'  # metadata data
    assert video['publication_date'] == '2017-07-13'  # creation date (DB)
    assert video['_project_id'] == '2093596'
    assert video['license'] == [{
        'license': 'CERN',
        'url': 'http://copyright.web.cern.ch',
    }]
    assert video['copyright'] == {
        'holder': 'CERN',
        'year': '2012',
        'url': 'http://copyright.web.cern.ch',
    }
    assert video['description'] == ''
    assert 'doi' in video
    assert video['_cds']['state'] == {
        "file_transcode": "SUCCESS",
        "file_video_extract_frames": "SUCCESS",
        "file_video_metadata_extraction": "SUCCESS"
    }
    assert 'extracted_metadata' in video['_cds']

    def check_files(video):
        bucket = CDSRecordDumpLoader._get_bucket(record=video)
        files = [dump_object(obj)
                 for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
        for file_ in files:
            assert as_bucket(file_['bucket_id']) is not None
            assert 'checksum' in file_
            assert 'content_type' in file_
            assert 'context_type' in file_
            assert FileInstance.query.filter_by(
                id=file_['file_id']) is not None
            assert 'key' in file_
            assert 'links' in file_
            assert 'content_type' in file_
            assert 'context_type' in file_
            assert 'media_type' in file_
            assert 'tags' in file_

        # check extracted metadata
        master_video = CDSVideosFilesIterator.get_master_video_file(video)
        assert any([key in master_video['tags']
                    for key in ExtractMetadataTask._all_keys])
        assert any([key in video['_cds']['extracted_metadata']
                    for key in ExtractMetadataTask._all_keys])

    def check_buckets(record, deposit):
        def get(key, record):
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
            return [file_[key] for file_ in files]

        def check(record, deposit, file_key, different=None):
            values_record = set(get(file_key, record))
            values_deposit = set(get(file_key, deposit))
            difference = len(values_record - values_deposit)
            assert different == difference

        def check_tag_master(record):
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            master = CDSVideosFilesIterator.get_master_video_file(record)
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)
                     if obj.get_tags().get('master')]
            assert all([file_['tags']['master'] == master['version_id']
                        for file_ in files])

        # 1 bucket record != 1 bucket deposit
        check(record, deposit, 'bucket_id', 1)
        # all file_id are the same except the smil file (only in record)
        check(record, deposit, 'file_id', 1)
        check(record, deposit, 'key', 1)
        # 18 object_version record != 17 object_version deposit
        check(record, deposit, 'version_id', 18)
        # check tag 'master' where is pointing
        check_tag_master(record)
        check_tag_master(deposit)

    def check_first_level_files(record):
        [master] = [file_ for file_ in deposit_video['_files']
                    if file_['context_type'] == 'master']
        assert len(master['subformat']) == 5
        assert len(master['frame']) == 10
        # TODO assert len(master['playlist']) == ??
        assert len([file_ for file_ in deposit_video['_files']
                    if file_['context_type'] == 'master']) == 1
        duration = float(record['_cds']['extracted_metadata']['duration'])
        for frame in master['frame']:
            assert float(frame['tags']['timestamp']) < duration
            assert float(frame['tags']['timestamp']) > 0
        # check tag 'preset_quality'
        pqs = [form['tags']['preset_quality'] for form in master['subformat']]
        assert sorted(pqs) == sorted(['1080p', '240p', '360p', '480p', '720p'])
        # check tag 'display_aspect_ratio'
        dar = set([form['tags']['display_aspect_ratio']
                   for form in master['subformat']])
        assert dar == {'16:9'}

    def check_pids(record):
        """Check pids."""
        assert record['report_number'][0] == 'CERN-VIDEO-2012-193-001'
        assert PersistentIdentifier.query.filter_by(
            pid_value='CERN-VIDEO-2012-193-001').count() == 1
        assert PersistentIdentifier.query.filter_by(
            pid_value='CERN-MOVIE-2012-193-001').count() == 1

    db.session.commit()

    # check video deposit
    deposit_video_uuid = PersistentIdentifier.query.filter(
        PersistentIdentifier.pid_type == 'depid',
        PersistentIdentifier.object_uuid != str(deposit_project_uuid),
        PersistentIdentifier.object_type == 'rec'
    ).one().object_uuid
    deposit_video = Video.get_record(str(deposit_video_uuid))
    assert Video._schema in deposit_video['$schema']
    video = Record.get_record(video_id)
    assert video.revision_id == deposit_video[
        '_deposit']['pid']['revision_id']
    assert deposit_video['_deposit']['created_by'] == users[0]
    assert deposit_video['_deposit']['owners'] == [users[0]]
    assert deposit_video['_project_id'] == '2093596'
    assert len(video['_files']) == 2
    assert len(deposit_video['_files']) == 2
    check_files(video)
    check_files(deposit_video)
    check_buckets(video, deposit_video)
    check_first_level_files(video)
    check_first_level_files(deposit_video)
    check_pids(video)

    # try to edit video
    deposit_video = deposit_video_resolver(deposit_video['_deposit']['id'])
    deposit_video = deposit_video.edit()

    # try to edit project
    deposit_project = deposit_project_resolver(
        deposit_project['_deposit']['id'])
    deposit_project = deposit_project.edit()

    login_user(User.query.filter_by(id=users[0]).first())
    deposit_video['title']['title'] = 'test'
    deposit_video = deposit_video.publish()
    _, record_video = deposit_video.fetch_published()
    assert record_video['title']['title'] == 'test'
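
This test calls load_json(datadir, 'cds_records_demo_1_project.json'), a two-argument variant that joins a test data directory and a file name. A plausible sketch of that helper, assuming it does nothing more than that:

import json
from os.path import join

def load_json(datadir, filename):
    #read a JSON fixture from the test data directory
    with open(join(datadir, filename)) as source:
        return json.load(source)
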
Example #36
0
# This code computes the bleu score for the candidate sentence

import nltk
import math
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
import json

from helpers import load_json
#from NN_architecture import generate_caption
# run the NN architecture before
captions_test = load_json('captions_test')

##generate_caption(path+filenames_test[0])
##with open('generated_captions_VGG19.txt') as inFile:
#with open('captions_vgg16/4_generated_captions_VGG16.txt') as inFile:
#    generated_test_captions=inFile.readlines()
#for i in range(len(generated_test_captions)):
##    THIS LINE REMOVES THE FIRST EMPTY SPACE
##    generated_test_captions[i]=generated_test_captions[i][1:]
#    generated_test_captions[i]=generated_test_captions[i].replace('\n','')
#

# load from json
with open('captions_vgg16/12_generated_captions_VGG16.json') as inFile:
    generated_test_captions = json.load(inFile)
c_to_insert = generated_test_captions[883]
generated_test_captions.insert(884, c_to_insert)
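
The scoring step itself is cut off in this excerpt. A sketch of how the imported BLEU utilities are typically applied, scoring each generated caption against its reference captions; the structure of captions_test is an assumption (a list of reference-caption lists aligned with the generated list).

smooth = SmoothingFunction().method1
scores = []
for references, candidate in zip(captions_test, generated_test_captions):
    reference_tokens = [word_tokenize(ref) for ref in references]
    candidate_tokens = word_tokenize(candidate)
    scores.append(sentence_bleu(reference_tokens, candidate_tokens,
                                smoothing_function=smooth))
print sum(scores) / len(scores)
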
Example #37
0
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    #with codecs.open(source_csv, 'rb', encoding='utf-8') as csvfile:
    with open(source_csv) as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        #reader = csv.reader(csvfile)
        #reader = unicodecsv.UnicodeReader(csvfile, encoding='utf-8')

        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0

        #want to randomize the order... distribute options more evenly
        #print len(reader)
        #exit()
        #in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print "Looking at row: %s" % count

            #could exit out early here, if needed
            if count > 10:
                #exit()
                pass

            print row
            address = row[0]

            ## no_units = row[12]

            #can pass this in as bldg_id to make_building
            #that gets used for parcel too
            parcel_id = row[1]
            bldg_id = parcel_id

            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            #eg building number
            qualifier_pre = row[6]
            #eg "UNIT" or "APT"
            qualifier_post = row[7]
            apt_num = row[8]
            #skip row9 (in/out... whatever that means)
            zip_code = row[10]
            #skip row11, assessor id
            #skip row12, address num
            #skip row13, x
            #skip row14, y
            #xcoord == lng
            lng = row[15]
            lat = row[16]

            #entry floor number: (named 'z' in sheet)
            floor = row[17]

            #skip row18, strcid... not sure
            #skip row19, parent
            #skip row20, app_
            #skip row21, hteloc
            zone = row[22]
            bldg_type = row[23]
            #number of buildings
            bldg_num = row[24]
            no_units = row[25]

            #skip row[26], inspection type
            #skip row27, app number
            #skip row28, date received
            #skip row29, application type
            #skip row30, ownerid
            #skip row31, operator id
            #skip row32, agent_id
            #skip row33, mail to
            central_heat = row[34]
            if central_heat == 'Y':
                central_heat = True
            else:
                central_heat = False

            #heat mechanism? heat mechanic??? not sure
            heat_mech = row[35]
            #skip row36, agent id (2)
            #skip row37, agent last name
            #skip row38 agent first name
            #skip row39 agent middle initial
            #skip row40, agent title
            #skip row41, business name

            #could be owner, could be agent
            owner_name = row[42]
            owner_address1 = row[43]
            owner_address2 = row[44]
            owner_city = row[45]
            owner_state = row[46]
            owner_zip = row[47]

            #address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])

            address_main = " ".join([
                street_num, street_dir, street_name, street_sfx, qualifier_pre
            ])
            address_main = address_main.strip()
            #get rid of any double spaces
            address_main = address_main.replace("  ", " ")

            apt_main = " ".join([qualifier_post, apt_num])
            apt_main = apt_main.strip()

            address = address_main
            print address

            owner_address = ", ".join([
                owner_address1, owner_address2, owner_city, owner_state,
                owner_zip
            ])

            ## #should always be "RENTAL" (don't need to track this one)
            ## permit_type = row[1]
            ## if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
            ##     raise ValueError, "Unexpected permit type: %s in row: %s" % (
            ##         permit_type, row)

            ## bldg_type = row[2]

            ## #can use this to filter out non-rental or obsolete entries
            ## #don't need to track otherwise:
            ## status = row[3]
            ## parcel_id = row[4]

            ## #should be fixed per source:
            ## ss_city = row[6]

            ## bldg_sf = row[7]
            ## no_bldgs = row[8]
            ## applicant_name = row[9]
            ## no_stories = row[10]
            ## no_units = row[11]

            ## sqft = row[7]
            ## number_of_buildings = row[8]
            ## applicant_name = row[9]
            ## number_of_stories = row[10]
            ## number_of_units = row[11]

            #check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

            ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

            #make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
                #nothing to geocode or store for an empty address
                continue
            else:
                #check if we've started processing any results for this row
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

            #temporarily just want to look at google again
            #location.sources = ["google"]
            #location.sources = ["google", "bing"]
            #location.sources = ["google", "bing", "usgeo", "geonames", "openmq"]
            #skip geocoding for columbia
            location.sources = []

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (address.upper(), city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                update = geo.lookup(search, geo_source, location, force=True)
                #update = geo.lookup(search, geo_source, location, force=False)
                if update:
                    any_updated = True

            location.sources = [
                'csv', "google", "bing", "usgeo", "geonames", "openmq", "mq"
            ]

            #manually add data from csv here:
            result = []
            result.append({'place': address, 'lat': lat, 'lng': lng})
            setattr(location, 'csv', result)

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not hasattr(location,
                           "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            #location.bldg_units = bldg_units
            #location.units_bdrms = units_bdrms
            locations[address.upper()] = location

            #handle the database storage
            bldg = make_building(location,
                                 bldg_id,
                                 city,
                                 feed_source,
                                 no_units=no_units,
                                 bldg_type=bldg_type)

            if apt_main:
                unit = make_unit(apt_main, bldg)

            (person, bldg_person) = make_person(owner_name,
                                                bldg,
                                                "Agent",
                                                address=owner_address)

            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print

            #exit()

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
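
The Location class used throughout these read_csv variants is never defined here. A minimal sketch of the interface the surrounding code relies on (constructing from a cached dict, arbitrary per-source attributes, and a to_dict round-trip for save_json); the attribute handling is an assumption based only on how it is called in these snippets.

class Location(object):
    #minimal stand-in for the helpers.Location used above
    def __init__(self, dictionary=None):
        self.sources = []
        self.address_alt = None
        if dictionary:
            #restore anything previously cached (e.g. per-source geocode results)
            for key, value in dictionary.items():
                setattr(self, key, value)

    def to_dict(self):
        #serialize everything back for save_json
        return self.__dict__.copy()
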
Example #38
0
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name


    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}
    
    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    #with codecs.open(source_csv, 'rb', encoding='utf-8') as csvfile:
    with open(source_csv) as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        #reader = csv.reader(csvfile)
        #reader = unicodecsv.UnicodeReader(csvfile, encoding='utf-8')

        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0

        #want to randomize the order... distribute options more evenly
        #print len(reader)
        #exit()
        #in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print "Looking at row: %s" % count
            
            #could exit out early here, if needed
            if count > 10:
                #exit()
                pass

            print row
            address = row[0]


            ## no_units = row[12]


            #can pass this in as bldg_id to make_building
            #that gets used for parcel too
            parcel_id = row[1]
            bldg_id = parcel_id

            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            #eg building number
            qualifier_pre = row[6]
            #eg "UNIT" or "APT"
            qualifier_post = row[7]
            apt_num = row[8]
            #skip row9 (in/out... whatever that means)
            zip_code = row[10]
            #skip row11, assessor id
            #skip row12, address num
            #skip row13, x
            #skip row14, y
            #xcoord == lng
            lng = row[15]
            lat = row[16]

            #entry floor number: (named 'z' in sheet)
            floor = row[17]

            #skip row18, strcid... not sure
            #skip row19, parent
            #skip row20, app_
            #skip row21, hteloc
            zone = row[22]
            bldg_type = row[23]
            #number of buildings
            bldg_num = row[24]
            no_units = row[25]

            #skip row[26], inspection type
            #skip row27, app number
            #skip row28, date received
            #skip row29, application type
            #skip row30, ownerid
            #skip row31, operator id
            #skip row32, agent_id
            #skip row33, mail to
            central_heat = row[34]
            if central_heat == 'Y':
                central_heat = True
            else:
                central_heat = False

            #heat mechanism? heat mechanic??? not sure
            heat_mech = row[35]
            #skip row36, agent id (2)
            #skip row37, agent last name
            #skip row38 agent first name
            #skip row39 agent middle initial
            #skip row40, agent title
            #skip row41, business name

            #could be owner, could be agent
            owner_name = row[42]
            owner_address1 = row[43]
            owner_address2 = row[44]
            owner_city = row[45]
            owner_state = row[46]
            owner_zip = row[47]

            
            #address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])

            address_main = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
            address_main = address_main.strip()
            #get rid of any double spaces
            address_main = address_main.replace("  ", " ")
            
            apt_main = " ".join([qualifier_post, apt_num])
            apt_main = apt_main.strip()

            address = address_main
            print address

            owner_address = ", ".join([owner_address1, owner_address2, owner_city, owner_state, owner_zip])
            
            ## #should always be "RENTAL" (don't need to track this one)
            ## permit_type = row[1]
            ## if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
            ##     raise ValueError, "Unexpected permit type: %s in row: %s" % (
            ##         permit_type, row)
            
            ## bldg_type = row[2]
            
            ## #can use this to filter out non-rental or obsolete entries
            ## #don't need to track otherwise:
            ## status = row[3]
            ## parcel_id = row[4]

            ## #should be fixed per source:
            ## ss_city = row[6]

            ## bldg_sf = row[7]
            ## no_bldgs = row[8]
            ## applicant_name = row[9]
            ## no_stories = row[10]
            ## no_units = row[11]

            ## sqft = row[7]
            ## number_of_buildings = row[8]
            ## applicant_name = row[9]
            ## number_of_stories = row[10]
            ## number_of_units = row[11]
            
            #check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

            ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

            #make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
                #nothing to geocode or store for an empty address
                continue
            else:
                #check if we've started processing any results for this row
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                else:
                    location = Location()

            #temporarily just want to look at google again
            #location.sources = ["google"]
            #location.sources = ["google", "bing"]
            #location.sources = ["google", "bing", "usgeo", "geonames", "openmq"]
            #skip geocoding for columbia
            location.sources = []
            
            #do some geocoding, as needed:
            search = "%s, %s, %s" % (address.upper(), city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                update = geo.lookup(search, geo_source, location, force=True)
                #update = geo.lookup(search, geo_source, location, force=False)
                if update:
                    any_updated = True

            location.sources = ['csv', "google", "bing", "usgeo", "geonames", "openmq", "mq"]

            #manually add data from csv here:
            result = []
            result.append({'place': address, 'lat': lat, 'lng': lng})
            setattr(location, 'csv', result)

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not hasattr(location, "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            #location.bldg_units = bldg_units
            #location.units_bdrms = units_bdrms
            locations[address.upper()] = location

            #handle the database storage
            bldg = make_building(location, bldg_id, city, feed_source, no_units=no_units, bldg_type=bldg_type)

            if apt_main:
                unit = make_unit(apt_main, bldg)

            (person, bldg_person) = make_person(owner_name, bldg, "Agent", address=owner_address)


            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print

            #exit()
            
    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
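
unicode_csv_reader is referenced in every CSV variant but never defined. Under Python 2, which all of these snippets assume given the print statements and has_key calls, the usual recipe is a thin generator around csv.reader that decodes each cell from UTF-8:

import csv

def unicode_csv_reader(utf8_file, **kwargs):
    #wrap csv.reader so every cell comes back as a unicode string
    for row in csv.reader(utf8_file, **kwargs):
        yield [cell.decode('utf-8') for cell in row]
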
Example #39
0
# LOAD THE TRANSFER MODEL
# can do this only at the first execution
#
#from tensorflow.python.keras.applications import VGG16
#from tensorflow.python.keras.models import Model
#image_model = VGG16(include_top=True, weights='imagenet')
#transfer_layer=image_model.get_layer('fc2')
#image_model_transfer = Model(inputs=image_model.input,
#                             outputs=transfer_layer.output)

transfer_values_train = np.load(
    'image_features/transfer_values/InceptionV3/transfer_values_train.npy')
transfer_values_test = np.load(
    'image_features/transfer_values/InceptionV3/transfer_values_test.npy')
captions_train = load_json('captions_train')
filename = 'InceptionCaptions/5_beamsearched.json'

out_dir = 'best_beamsearched/InceptionV3/'

with open(filename, 'r') as inFile:
    beamCaptions = json.load(inFile)
    beamCaptions = tuple(beamCaptions)


def get_transfer_values(image_path):
    tv_len = transfer_values_test[0].shape[0]
    filename = image_path[len(image_dir):]
    for i in range(len(filenames_test)):
        if filenames_test[i] == filename:
            break
Example #40
0
def test_migrate_record(app, location, datadir, es):
    """Test migrate date."""
    # create the project
    data = load_json(datadir, 'cds_records_demo_1_project.json')
    dump = CDSRecordDump(data=data[0])
    project = CDSRecordDumpLoader.create(dump=dump)
    p_id = project.id

    date = '2015-11-13'
    assert project['$schema'] == Project.get_record_schema()
    assert project['date'] == date
    assert project['publication_date'] == date
    assert 'license' not in project
    assert 'copyright' not in project
    assert project['_cds'] == {
        "state": {
            "file_transcode": "SUCCESS",
            "file_video_extract_frames": "SUCCESS",
            "file_video_metadata_extraction": "SUCCESS"
        },
        'modified_by': None,
    }

    # check project deposit
    deposit_project_uuid = PersistentIdentifier.query.filter_by(
        pid_type='depid', object_type='rec').one().object_uuid
    deposit_project = Record.get_record(deposit_project_uuid)
    assert Project._schema in deposit_project['$schema']
    assert project.revision_id == deposit_project[
        '_deposit']['pid']['revision_id']
    assert deposit_project['_deposit']['created_by'] == -1
    assert deposit_project['_deposit']['owners'] == [-1]
    assert deposit_project['_files'] == []

    # create the video
    data = load_json(datadir, 'cds_records_demo_1_video.json')
    dump = CDSRecordDump(data=data[0])

    def load_video(*args, **kwargs):
        return open(join(datadir, 'test.mp4'), 'rb')

    with mock.patch.object(DataCiteProvider, 'register') as mock_datacite, \
            mock.patch.object(
                CDSRecordDumpLoader, '_get_migration_file_stream',
                return_value=load_video()):
        video = CDSRecordDumpLoader.create(dump=dump)
        # assert mock_datacite.called is True
    project = Record.get_record(p_id)
    assert project['videos'] == [
        {'$ref': 'https://cds.cern.ch/api/record/1495143'}
    ]
    assert video['$schema'] == Video.get_record_schema()
    date = '2012-11-20'
    assert video['date'] == date
    assert video['publication_date'] == date
    assert video['_project_id'] == '2093596'
    assert video['license'] == [{
        'license': 'CERN',
        'url': 'http://copyright.web.cern.ch',
    }]
    assert video['copyright'] == {
        'holder': 'CERN',
        'year': '2012',
        'url': 'http://copyright.web.cern.ch',
    }
    assert video['description'] == ''
    assert 'doi' in video
    assert video['_cds']['state'] == {
        "file_transcode": "SUCCESS",
        "file_video_extract_frames": "SUCCESS",
        "file_video_metadata_extraction": "SUCCESS"
    }
    assert 'extracted_metadata' in video['_cds']

    def check_files(video):
        bucket = CDSRecordDumpLoader._get_bucket(record=video)
        files = [dump_object(obj)
                 for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
        for file_ in files:
            assert as_bucket(file_['bucket_id']) is not None
            assert 'checksum' in file_
            assert 'content_type' in file_
            assert 'context_type' in file_
            assert FileInstance.query.filter_by(
                id=file_['file_id']) is not None
            assert 'key' in file_
            assert 'links' in file_
            assert 'content_type' in file_
            assert 'context_type' in file_
            assert 'media_type' in file_
            assert 'tags' in file_

        # check extracted metadata
        master_video = CDSVideosFilesIterator.get_master_video_file(video)
        assert any([key in master_video['tags']
                    for key in ExtractMetadataTask._all_keys])
        assert any([key in video['_cds']['extracted_metadata']
                    for key in ExtractMetadataTask._all_keys])

    def check_buckets(record, deposit):
        def get(key, record):
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
            return [file_[key] for file_ in files]

        def check(record, deposit, file_key, different=None):
            values_record = set(get(file_key, record))
            values_deposit = set(get(file_key, deposit))
            difference = len(values_record - values_deposit)
            assert different == difference

        def check_tag_master(record):
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            master = CDSVideosFilesIterator.get_master_video_file(record)
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)
                     if obj.get_tags().get('master')]
            assert all([file_['tags']['master'] == master['version_id']
                        for file_ in files])

        # 1 bucket record != 1 bucket deposit
        check(record, deposit, 'bucket_id', 1)
        # all file_id are the same except the smil file (only in record)
        check(record, deposit, 'file_id', 1)
        check(record, deposit, 'key', 1)
        # 18 object_version record != 17 object_version deposit
        check(record, deposit, 'version_id', 18)
        # check tag 'master' where is pointing
        check_tag_master(record)
        check_tag_master(deposit)

    def check_first_level_files(record):
        [master] = [file_ for file_ in deposit_video['_files']
                    if file_['context_type'] == 'master']
        assert len(master['subformat']) == 5
        assert len(master['frame']) == 10
        # TODO assert len(master['playlist']) == ??
        assert len([file_ for file_ in deposit_video['_files']
                    if file_['context_type'] == 'master']) == 1
        duration = float(record['_cds']['extracted_metadata']['duration'])
        for frame in master['frame']:
            assert float(frame['tags']['timestamp']) < duration
            assert float(frame['tags']['timestamp']) > 0

    # check video deposit
    deposit_video_uuid = PersistentIdentifier.query.filter(
        PersistentIdentifier.pid_type == 'depid',
        PersistentIdentifier.object_uuid != str(deposit_project_uuid),
        PersistentIdentifier.object_type == 'rec'
    ).one().object_uuid
    deposit_video = Video.get_record(str(deposit_video_uuid))
    assert Video._schema in deposit_video['$schema']
    assert video.revision_id == deposit_video[
        '_deposit']['pid']['revision_id']
    assert deposit_video['_deposit']['created_by'] == -1
    assert deposit_video['_deposit']['owners'] == [-1]
    assert len(video['_files']) == 2
    assert len(deposit_video['_files']) == 2
    check_files(video)
    check_files(deposit_video)
    check_buckets(video, deposit_video)
    check_first_level_files(video)
    check_first_level_files(deposit_video)

    # try to edit video
    deposit_video = deposit_video_resolver(deposit_video['_deposit']['id'])
    deposit_video = deposit_video.edit()

    # try to edit project
    deposit_project = deposit_project_resolver(
        deposit_project['_deposit']['id'])
    deposit_project = deposit_project.edit()

    # try to publish again the video
    deposit_video['title']['title'] = 'test'
    deposit_video = deposit_video.publish()
    _, record_video = deposit_video.fetch_published()
    assert record_video['title']['title'] == 'test'
Example #41
0
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name

    cache_file = "%s-20150525.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key("buildings"):
        local_cache["buildings"] = {}

    search_results = {}
    for key, value in local_cache["buildings"].items():
        # search_results[key] = Location(value)
        sr = SearchResults()
        sr.from_dict(value)
        # print
        # print sr
        # print
        search_results[key] = sr

    # geocoder helper:
    # geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:

        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print ">, <".join(reader.next())

        count = 0

        # want to randomize the order... distribute options more evenly
        # print len(reader)
        # exit()
        # in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print "Looking at row: %s" % count

            any_updated = False

            # could exit out early here, if needed
            if count > 10:
                # exit()
                pass

            # if you want to skip ahead more quickly:
            if count < 27187:
                pass
            else:

                # print row
                objectid = row[0]

                ## no_units = row[12]

                # can pass this in as bldg_id to make_building
                # that gets used for parcel too
                parcel_id = row[1]
                bldg_id = parcel_id

                street_num = row[2]
                street_dir = row[3]
                street_name = row[4]
                street_sfx = row[5]
                # eg building number
                qualifier_pre = row[6]
                # eg "UNIT" or "APT"
                qualifier_post = row[7]
                apt_num = row[8]
                # skip row9 (in/out... whatever that means)
                zip_code = row[10]
                # skip row11, assessor id
                # skip row12, address num
                # skip row13, x
                # skip row14, y
                # xcoord == lng
                lng = row[15]
                lat = row[16]

                # entry floor number: (named 'z' in sheet)
                floor = row[17]

                # skip row18, strcid... not sure
                # skip row19, parent
                # skip row20, app_
                # skip row21, hteloc
                zone = row[22]
                bldg_type = row[23]
                # number of buildings
                bldg_num = row[24]
                no_units = row[25]

                # skip row[26], inspection type
                # skip row27, app number
                # skip row28, date received
                # skip row29, application type
                # skip row30, ownerid
                # skip row31, operator id
                # skip row32, agent_id
                # skip row33, mail to
                central_heat = row[34]
                if central_heat == "Y":
                    central_heat = True
                else:
                    central_heat = False

                # heat mechanism? heat mechanic??? not sure
                heat_mech = row[35]
                # skip row36, agent id (2)
                # skip row37, agent last name
                # skip row38 agent first name
                # skip row39 agent middle initial
                # skip row40, agent title
                # skip row41, business name

                # could be owner, could be agent
                owner_name = row[42]
                owner_address1 = row[43]
                owner_address2 = row[44]
                owner_city = row[45]
                owner_state = row[46]
                owner_zip = row[47]

                # address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])

                # this is causing problems with lookups in google
                if (
                    qualifier_pre == "DUP"
                    or qualifier_pre == "DUPE"
                    or qualifier_pre == "2-Jan"
                    or qualifier_pre == "HM"
                    or qualifier_pre == "DWN"
                ):
                    qualifier_pre = ""

                address_main = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
                address_main = address_main.strip()
                # get rid of any double spaces
                address_main = address_main.replace("  ", " ")

                # similar to conversions,
                # but there are too many of these to list there
                if re.search("HOLLY RIDGE LN", address_main):
                    address_main = address_main.replace("HOLLY RIDGE LN", "HOLLYRIDGE LN")
                if re.search("BERKSHIRE CT", address_main):
                    address_main = address_main.replace("BERKSHIRE CT", "BERKSHIRE")
                    # address_main = ''
                if re.search("CAMERON CT", address_main):
                    address_main = address_main.replace("CAMERON CT", "CAMERON")
                    # address_main = ''
                if re.search("ATHENS CT", address_main):
                    address_main = address_main.replace("ATHENS CT", "ATHENS")
                    # address_main = ''
                if re.search("LAMAR CT", address_main):
                    address_main = address_main.replace("LAMAR CT", "LAMAR")
                    # address_main = ''
                if re.search("MONITEAU CT", address_main):
                    address_main = address_main.replace("MONITEAU CT", "MONITEAU")
                    # address_main = ''
                if re.search("IMPERIAL CT", address_main):
                    address_main = ""
                if re.search("PERKINS DR", address_main):
                    address_main = ""
                if re.search("GRANITE OAKS CT", address_main):
                    address_main = ""

                # sometimes the 'BLDG' data is added in the wrong place
                # then it gets treated as a unit item
                # (but it's not *always* a unit item, so can't generalize it that way)
                if qualifier_post == "BLDG" or qualifier_post == "LOT":
                    address_main = " ".join([address_main, qualifier_post, apt_main])
                    address_main = address_main.strip()
                    apt_main = ""
                else:
                    apt_main = " ".join([qualifier_post, apt_num])
                    apt_main = apt_main.strip()

                # check if this is one we want to skip
                if conversions.has_key(address_main.upper()):
                    address_main = conversions[address_main.upper()]

                if address_main:
                    print "APT_MAIN: ", apt_main
                    address = ", ".join([address_main, apt_main])
                else:
                    address = ""

                owner_address = ", ".join([owner_address1, owner_address2, owner_city, owner_state, owner_zip])

                ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

                print "Parcel ID:", parcel_id
                print address

                results = None

                # make sure it's not one we're skipping:
                if not address:
                    print "SKIPPING ITEM: %s" % row[1]
                    skips += 1

                    skipf = codecs.open("skips.txt", "a", encoding="utf-8")
                    original = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
                    skipf.write(original)
                    skipf.write("\n")
                    skipf.close()

                else:
                    # check if we've started processing any results for this row
                    if search_results.has_key(address.upper()):
                        print "Already had building: %s" % address
                        results = search_results[address.upper()]
                        # print results
                    else:

                        addy = ", ".join([address_main, city.name, city.state])
                        addy += " " + zip_code
                        # addy += ", USA"
                        print addy

                        # toggle between an actual google query
                        results = address_search(addy, apt_main)

                        # print dir(results)

                        if len(results.matches) > 1:
                            print results
                            for option in results.matches:
                                print "%s: %s, %s" % (option["place"], option["lat"], option["lng"])
                            print
                            print "Source Lat: %s, Lng: %s" % (lat, lng)
                            src_lat = int(float(lat) * 100)
                            src_lng = int(float(lng) * 100)

                            matched = False
                            for current in results.matches:
                                # current = results.matches[0]
                                print current["lat"]
                                print current["lng"]
                                # only want to look at the first 2 decimal places:
                                comp_lat = int(float(current["lat"]) * 100)
                                comp_lng = int(float(current["lng"]) * 100)
                                print comp_lat
                                print comp_lng

                                if (src_lat == comp_lat) and (src_lng == comp_lng):
                                    # results.matches = results.matches[:1]
                                    results.matches = [current]
                                    matched = True

                            if not matched:
                                print "DIDN'T MATCH!"
                                exit()

                        any_updated = True

                        # or just using results as specified in csv
                        # (THIS DOES NOT NORMALIZE THE ADDRESS VIA GOOGLE)
                        # results = SearchResults()
                        # results.unit_text = apt_main
                        # handle_place(results, addy, lat, lng, apt_main)

                    assert results
                    # print results

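                    # look up the matching building record (make=True to create it for this parcel if needed)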
                    lookup_building_with_geo(results, make=True, parcel_id=parcel_id)
                    # print results
                    # current['results'] = results

                    # print results

                    if results.errors:
                        print results
                        raise ValueError, results.errors
                    else:

                        search_results[address.upper()] = results

                        bldg = results.building
                        assert bldg
                        unit = results.unit

                        # may be a case where the unit is blank
                        # and another unit with a number/letter was created earlier
                        # in that case, we won't be creating one here
                        # and the building will already exist...
                        # not necessarily an error though
                        # just redundant data
                        # assert unit

                        (person, bldg_person) = make_person(owner_name, bldg, "Agent", address=owner_address)

                    # time.sleep(1)

            if any_updated:
                # back it up for later
                # enable this when downloading GPS coordinates...
                # the rest of the time it slows things down
                local_cache["buildings"] = {}
                for key, value in search_results.items():
                    # search_results[key] = SearchResults().from_dict(value)
                    local_cache["buildings"][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print
    loss_mean = tf.reduce_mean(loss)

    return loss_mean

# This code is used to generate captions
# the CNN model, as well as the image size, has to be specified
image_model = VGG16(include_top=True, weights='imagenet')
transfer_layer=image_model.get_layer('fc2')
image_model_transfer = Model(inputs=image_model.input,
                             outputs=transfer_layer.output)
img_size=K.int_shape(image_model.input)[1:3]

# recreate the tokenizer
mark_start='ssss '
mark_end=' eeee'
captions_train=load_json('captions_train_saifullah')
captions_train_marked=mark_captions(captions_train)
captions_train_flat=flatten(captions_train_marked)
tokenizer=TokenizerWrap(texts=captions_train_flat,
                        num_words=167)
token_start=tokenizer.word_index[mark_start.strip()]
token_end=tokenizer.word_index[mark_end.strip()]
tokens_train=tokenizer.captions_to_tokens(captions_train_marked)

def generate_caption(image_path, max_tokens=30):
    """
    Generate a caption for the image in the given path.
    The caption is limited to the given number of tokens (words).
    """

    # Load and resize the image.
def read_csv(source_csv, city_name, city_tag, driver):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    position_file = "position.json"
    position = load_json(position_file, create=True)
    if not position:
        position = 0

    cache_file = "%s-20150525.json.bkup" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    
    search_results = {}
    for key, value in local_cache['buildings'].items():
        #search_results[key] = Location(value)
        sr = SearchResults()
        sr.from_dict(value)
        #print
        #print sr
        #print 
        search_results[key] = sr

    #geocoder helper:
    #geo = Geo()

    provider = ''
    provider_options = ServiceProvider.objects.filter(name='City of Columbia')
    if len(provider_options):
        provider = provider_options[0]
    else:
        raise ValueError, "error finding utility_provider: %s matches" % len(provider_options)                    


    skips = 0
    with open(source_csv) as csvfile:

        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0

        #want to randomize the order... distribute options more evenly
        #print len(reader)
        #exit()
        #in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print "Looking at row: %s, position: %s" % (count, position)
            start = datetime.now()
            print "Started: ", start
            
            any_updated = False
            
            #could exit out early here, if needed
            if count > 10:
                #exit()
                pass

            #if you want to skip ahead more quickly:
            #if count < 0:
            if count < position:
                pass
            else:

                #print row
                objectid = row[0]


                ## no_units = row[12]


                #can pass this in as bldg_id to make_building
                #that gets used for parcel too
                parcel_id = row[1]
                bldg_id = parcel_id

                street_num = row[2]
                street_dir = row[3]
                street_name = row[4]
                street_sfx = row[5]
                #eg building number
                qualifier_pre = row[6]
                #eg "UNIT" or "APT"
                qualifier_post = row[7]
                apt_num = row[8]
                #skip row9 (in/out... whatever that means)
                zip_code = row[10]
                #skip row11, assessor id
                #skip row12, address num
                #skip row13, x
                #skip row14, y
                #xcoord == lng
                lng = row[15]
                lat = row[16]

                #entry floor number: (named 'z' in sheet)
                floor = row[17]

                #skip row18, strcid... not sure
                #skip row19, parent
                #skip row20, app_
                #skip row21, hteloc
                zone = row[22]
                bldg_type = row[23]
                #number of buildings
                bldg_num = row[24]
                no_units = row[25]

                #skip row[26], inspection type
                #skip row27, app number
                #skip row28, date received
                #skip row29, application type
                #skip row30, ownerid
                #skip row31, operator id
                #skip row32, agent_id
                #skip row33, mail to
                central_heat = row[34]
                if central_heat == 'Y':
                    central_heat = True
                else:
                    central_heat = False

                #heat mechanism? heat mechanic??? not sure
                heat_mech = row[35]
                #skip row36, agent id (2)
                #skip row37, agent last name
                #skip row38 agent first name
                #skip row39 agent middle initial
                #skip row40, agent title
                #skip row41, business name

                #could be owner, could be agent
                ## owner_name = row[42]
                ## owner_address1 = row[43]
                ## owner_address2 = row[44]
                ## owner_city = row[45]
                ## owner_state = row[46]
                ## owner_zip = row[47]


                #address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])

                #this is causing problems with lookups in google
                if qualifier_pre == "DUP" or qualifier_pre == "DUPE" or qualifier_pre == "2-Jan" or qualifier_pre == "HM" or qualifier_pre == "DWN":
                    qualifier_pre = ''

                address_main = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
                address_main = address_main.strip()
                #get rid of any double spaces
                address_main = address_main.replace("  ", " ")

                #similar to conversions,
                #but there are too many of these to list there
                if re.search('HOLLY RIDGE LN', address_main):
                    address_main = address_main.replace('HOLLY RIDGE LN', 'HOLLYRIDGE LN')
                if re.search('BERKSHIRE CT', address_main):
                    address_main = address_main.replace('BERKSHIRE CT', 'BERKSHIRE')
                    #address_main = ''
                if re.search('CAMERON CT', address_main):
                    address_main = address_main.replace('CAMERON CT', 'CAMERON')
                    #address_main = ''
                if re.search('ATHENS CT', address_main):
                    address_main = address_main.replace('ATHENS CT', 'ATHENS')
                    #address_main = ''
                if re.search('LAMAR CT', address_main):
                    address_main = address_main.replace('LAMAR CT', 'LAMAR')
                    #address_main = ''
                if re.search('MONITEAU CT', address_main):
                    address_main = address_main.replace('MONITEAU CT', 'MONITEAU')
                    #address_main = ''
                if re.search('IMPERIAL CT', address_main):
                    address_main = ''
                if re.search('PERKINS DR', address_main):
                    address_main = ''
                if re.search('GRANITE OAKS CT', address_main):
                    address_main = ''

                    

                #sometimes the 'BLDG' data is added in the wrong place
                #then it gets treated as a unit item
                #(but it's not *always* a unit item, so can't generalize it that way)
                if qualifier_post == "BLDG" or qualifier_post == "LOT":
                    address_main = " ".join([address_main, qualifier_post, apt_main])
                    address_main = address_main.strip()
                    apt_main = ''
                else:
                    apt_main = " ".join([qualifier_post, apt_num])
                    apt_main = apt_main.strip()

                #check if this is one we want to skip
                if conversions.has_key(address_main.upper()):
                    address_main = conversions[address_main.upper()]

                if address_main:
                    print "APT_MAIN: ", apt_main
                    address = ", ".join([address_main, apt_main])
                else:
                    address = ''

                ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

                print "Parcel ID:", parcel_id
                print address

                results = None

                #make sure it's not one we're skipping:
                if not address:
                    print "SKIPPING ITEM: %s" % row[1]
                    skips += 1

                    ## skips = codecs.open("skips.txt", 'a', encoding='utf-8')
                    ## original = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
                    ## skips.write(original)
                    ## skips.write('\n')
                    ## skips.close()
                    
                #check if we've started processing any results for this row
                elif not search_results.has_key(address.upper()):
                    print "No saved search results for address: %s" % address
                    print "Skipping."
                    print
                    #raise ValueError, "No results found for %s" % address

                else:
                    
                    print "Already had building: %s" % address
                    results = search_results[address.upper()]

                    assert results
                    #print results

                    lookup_building_with_geo(results, make=True, parcel_id=parcel_id)
                    #print results
                    #current['results'] = results

                    #print results

                    if results.errors:
                        print results
                        raise ValueError, results.errors
                    else:

                        bldg = results.building
                        assert bldg
                        unit = results.unit

                        #at this point there should be at least one unit
                        #and we will want to associate results with that unit
                        #assert unit
                        # can just pass this up in this case

                        if not unit:
                            print "Skipping address... no matching Unit!"

                        else:


                            #now that we have a building
                            #look up energy data on the remote website

                            #result = urllib2.urlopen("http://example.com/foo/bar")
                            #print result.read()

                            ## base = "http://www.gocolumbiamo.com/cfforms/ub/rental.html"
                            ## driver.get(base)
                            ## search = driver.find_element_by_css_selector('#address')
                            ## search.send_keys(address)
                            ## button = driver.find_element_by_css_selector('.ui-bar > a:nth-child(2)')
                            ## #button = driver.find_element_by_css_selector('#PrimaryCenterColumn > div > div.ui-bar-b.ui-header > div > a.ui-btn.ui-btn-corner-all.ui-shadow.ui-btn-down-b.ui-btn-up-b')
                            ## #button = driver.find_element_by_css_selector('#PrimaryCenterColumn > div > div.ui-bar-b.ui-header > div > a.ui-btn.ui-btn-corner-all.ui-shadow.ui-btn-down-b.ui-btn-up-b > span > span')
                            ## button.click()
                            ## time.sleep(4)

                            ## #results = driver.find_element_by_css_selector('.dojoxGridMasterView')
                            ## results = driver.find_element_by_css_selector('.dojoxGridContent > div:nth-child(1)')
                            ## print results.get_attribute('innerHTML')
                            ## print parcel_id

                            ## options = results.find_elements_by_tag_name('div')
                            ## #options = results.find_elements_by_link_text(parcel_id)
                            ## print options
                            ## #something didn't work with this:
                            ## #look_for = '<td tabindex="-1" role="gridcell" colspan="1" class="dojoxGridCell" idx="0" style="width:90px;">%s</td>' % parcel_id
                            ## look_for = '>%s<' % parcel_id

                            ## matches = []
                            ## for option in options:
                            ##     markup = option.get_attribute('innerHTML')
                            ##     #print markup
                            ##     if re.search(look_for, markup):
                            ##         matches.append(option)
                            ##         #print "MATCH!"

                            ## if len(matches) > 1:
                            ##     print matches
                            ##     raise ValueError, "Too many matches!"
                            ## else:
                            ##     matches[0].click()


                            #just realized that this form uses the property_id
                            #which we already have...
                            #can skip the steps above that are trying to make this link:

                            base = "http://www.gocolumbiamo.com/cfforms/ub/ubdata.cfm?LOCID=%s&AppNum=79" % parcel_id
                            driver.get(base)

                            try:
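                                # the 'Heating Source' cell is located by its position in the first summary table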
                                heat_source = driver.find_element_by_css_selector('#PrimaryCenterColumn > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(3) > td:nth-child(1) > strong:nth-child(1) > font:nth-child(1)')
                                if heat_source.text.strip() == "Heating Source: Gas Heat":
                                    bldg.heat_source_details = 'gas'
                                    bldg.save()
                                else:
                                    print heat_source.text
                                    exit()
                                    #TODO:
                                    bldg.heat_source_details = 'electric'
                                    bldg.who_pays_gas = 'not_available'
                            except:
                                print "heat source not found... skipping"
                                    
                            try:
                                selector = driver.find_element_by_css_selector('#el_table_length > label:nth-child(1) > select:nth-child(1) > option:nth-child(3)')
                                selector.click()
                            except:
                                print "No Water data available... skipping"
                            else:

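                                # each row of the electric usage table is (billing month, cost in $, usage in KWH)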
                                body = driver.find_element_by_css_selector('#el_table > tbody:nth-child(3)')
                                rows = body.find_elements_by_tag_name('tr')
                                #row = rows[0]
                                query = bldg.utilitysummary_set.filter(type='electricity')
                                for row in rows:
                                    #print row.get_attribute('innerHTML')
                                    cols = row.find_elements_by_tag_name('td')
                                    date = cols[0].text + '-01'
                                    cost = cols[1].text.replace('$', '').strip()
                                    amount = cols[2].text
                                    amount = amount.replace(' KWH', '')
                                    update_summary(query, date, cost, amount, bldg, unit, provider, 'electricity', 'kwh')
                                    #update_summary(query, date, cost, amount)
                                    #for item in cols:
                                    #    print item.text


                            #print dir(bldg)
                            #print bldg.utilitysummary_set
                            #query = bldg.utilitysummary_set.filter(type=utility_type[0])
                            #could look up type from UTILITY_TYPES...
                            #but in this case we know what they should be
                            #query = bldg.utilitysummary_set.filter(type='water')
                            #if len(query):

                            try:
                                water = driver.find_element_by_css_selector('#ext-gen23')
                                water.click()

                                selector = driver.find_element_by_css_selector('#wr_table_length > label:nth-child(1) > select:nth-child(1) > option:nth-child(3)')
                                selector.click()
                            except:
                                print "No Water data available... skipping"
                            else:

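                                # same layout as the electric table, but usage is reported in CCF of water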
                                body = driver.find_element_by_css_selector('#wr_table > tbody:nth-child(3)')

                                rows = body.find_elements_by_tag_name('tr')
                                #row = rows[0]
                                query = bldg.utilitysummary_set.filter(type='water')
                                for row in rows:
                                    #print row.get_attribute('innerHTML')
                                    cols = row.find_elements_by_tag_name('td')
                                    date = cols[0].text + '-01'
                                    cost = cols[1].text.replace('$', '').strip()
                                    amount = cols[2].text
                                    amount = amount.replace(' CCF', '')
                                    update_summary(query, date, cost, amount, bldg, unit, provider, 'water', 'ccf')
                                    #update_summary(query, date, cost, amount)
                                    #for item in cols:
                                    #    print item.text


                            unit.update_averages()

                            #see if we have enough info now to make a score:
                            unit.update_energy_score()

                            #now that we've saved the unit,
                            #update the averages for the whole building:
                            unit.building.update_utility_averages()
                            unit.building.update_rent_details()

                
                position += 1
                save_json(position_file, position)
        
            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in search_results.items():
                    #search_results[key] = SearchResults().from_dict(value)
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

                position = count
                save_json(position_file, position)
                exit()

            end = datetime.now()
            print "finished: ", end
            total_time = end - start
            print total_time

            print
Example #44
0
# This code computes the BLEU score for the candidate sentences

import nltk
import math
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
import json

from helpers import load_json
#from NN_architecture import generate_caption
# run the NN_architecture script first
captions_test = load_json('captions_test_saifullah')

##generate_caption(path+filenames_test[0])
##with open('generated_captions_VGG19.txt') as inFile:
#with open('captions_vgg16/4_generated_captions_VGG16.txt') as inFile:
#    generated_test_captions=inFile.readlines()
#for i in range(len(generated_test_captions)):
##    THIS LINE REMOVES THE FIRST EMPTY SPACE
##    generated_test_captions[i]=generated_test_captions[i][1:]
#    generated_test_captions[i]=generated_test_captions[i].replace('\n','')
#

# load from json
with open('InceptionCaptions/9_greedy.json') as inFile:
    generated_test_captions = json.load(inFile)
#c_to_insert=generated_test_captions[883]
#generated_test_captions.insert(884,c_to_insert)
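
# A minimal sketch (not in the original script) of how the BLEU helpers imported above
# could be applied, assuming captions_test holds one list of reference captions per image,
# aligned index-by-index with generated_test_captions:
smoothie = SmoothingFunction().method1
references = [[word_tokenize(ref.lower()) for ref in refs] for refs in captions_test]
candidates = [word_tokenize(cap.lower()) for cap in generated_test_captions]
print('Corpus BLEU-4: %.4f' % corpus_bleu(references, candidates,
                                          smoothing_function=smoothie))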
Example #45
0
from collections import OrderedDict
from rwrs import app
import helpers


MAPS = helpers.load_json(app.config['MAPS_DATA_FILE'])
RANKS = helpers.load_json(app.config['RANKS_DATA_FILE'])

SQUADMATES_STEPS_XP = 1000 # One squad mate is gained every 1000 XP
MAX_SQUADMATES = 10 # Maximum squad mates allowed
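
# Not part of the original module: a minimal sketch of how these two constants combine
# (the helper name is hypothetical).
def squadmates_for_xp(xp):
    """Squad mates earned for a given XP total, capped at MAX_SQUADMATES."""
    return min(xp // SQUADMATES_STEPS_XP, MAX_SQUADMATES)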

UNLOCKABLES = {
    'vanilla': OrderedDict([
        (0, {
            'weapons': [
                {'image': 'assault_rifles', 'name': 'Assault rifles'},
                {'image': 'shotguns', 'name': 'Shotguns'}
            ],
            'equipment': [
                {'image': 'riot_shield', 'name': 'Riot shield'}
            ],
            'throwables': [
                {'image': 'hand_stun_grenades', 'name': '2 hand/stun grenades'}
            ]
        }),
        (500, {
            'weapons': [
                {'image': 'bazooka', 'name': 'Bazooka'},
                {'image': 'pistols_sd', 'name': 'Silenced pistols'}
            ],
            'equipment': [
Example #46
0
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true,
                                                          logits=y_pred)

    # Keras may reduce this across the first axis (the batch),
    # but the semantics are unclear, so to be sure we use the loss
    # across the entire 2-rank tensor and reduce it to a single
    # scalar with the mean function.
    loss_mean = tf.reduce_mean(loss)

    return loss_mean


# recreate the tokenizer
mark_start='ssss '
mark_end=' eeee'
captions_train=load_json('captions_train')
captions_train_marked=mark_captions(captions_train)
captions_train_flat=flatten(captions_train_marked)
tokenizer=TokenizerWrap(texts=captions_train_flat,
                        num_words=2000)
token_start=tokenizer.word_index[mark_start.strip()]
token_end=tokenizer.word_index[mark_end.strip()]
tokens_train=tokenizer.captions_to_tokens(captions_train_marked)

filenames_val=load_json('filenames_val')


def generate_caption(image_path, max_tokens=30):
    """
    Generate a caption for the image in the given path.
    The caption is limited to the given number of tokens (words).