Exemple #1
def method_new(name="Untitled Q-Method", owner="Your Name", email="email", phone='phone', notes=''):
    """Create a new Q-Method directory with a default configuration.

    Ids are generated until one does not collide with an entry already
    present in ``data_path``.  A directory named after that id is created,
    a ``config.json`` holding the supplied details plus sample statements
    and a default column layout is saved in it, and the request is
    redirected to the new method's bookmark page.

    :param name: stored under the ``name`` key of the configuration
    :param owner: stored under the ``owner`` key
    :param email: stored under the ``email`` key
    :param phone: stored under the ``phone`` key
    :param notes: stored under the ``notes`` key
    :raises ValueError: if the path for the freshly generated id already exists
    """
    #look for existing methods:
    options = os.listdir(data_path)
    new_option = ""
    #make sure that:
    #a) we have a new id and
    #b) the new id has not already been used
    while (not new_option) or (new_option in options):
        new_option = generate_id()

    #make new directory in method_path
    method_path = os.path.join(data_path, new_option)
    if os.path.exists(method_path):
        #This should never happen with above while loop, but just in case...
        raise ValueError("Path exists, but it shouldn't: %s" % method_path)
    os.makedirs(method_path)

    #make an empty configuration file
    config = os.path.join(method_path, "config.json")
    result = load_json(config, create=True)
    result['name'] = name
    result['owner'] = owner
    result['email'] = email
    result['phone'] = phone
    result['notes'] = notes
    result['statements'] = """1. First sample statement
2. Second sample statement"""
    result['columns'] = '2 3 5 6 8 6 5 3 2'

    save_json(config, result)
    #redirect to the new method's page:
    redirect("/method/" + new_option + "/bookmark/")
Exemple #2
def get_champion_id_dict(SECRET_API_KEY=None):
    """Return the champion id -> name mapping, cached on disk for a month.

    The cached mapping in ``.cache/champ_ids.json`` is returned while its
    ``expiry_time`` entry lies in the future; otherwise the static-data
    endpoint is queried and the cache is rewritten.

    :param SECRET_API_KEY: API key; read with ``read_api_key()`` when falsy
    :return: the mapping (which also carries an ``expiry_time`` key), the
        raw response object when the request fails with a status other than
        200/429, or ``None`` when the mapping could not be stored
    """
    dct = helpers.load_json(".cache/champ_ids.json")
    if dct and time.time() <= dct["expiry_time"]:
        # cache hit that has not expired yet
        return dct

    if not SECRET_API_KEY:
        SECRET_API_KEY = read_api_key()
    url = BASE + "/lol/static-data/v3/champions?locale=en_US&dataById=true"
    headers = {"X-Riot-Token": SECRET_API_KEY}
    r = requests.get(url, headers=headers)
    if r.status_code == 429:
        t = r.json()["Retry-After"]
        print("Waiting " + str(t) + " seconds and trying again")
        print("Full response:")
        # actually wait, then retry THIS request (the previous code retried
        # an unrelated match lookup with an undefined match id)
        time.sleep(float(t))
        return get_champion_id_dict(SECRET_API_KEY)
    if r.status_code != 200:
        print("Get champion id dict failed")
        return r

    data = r.json()["data"]
    dct = {champ_id: data[champ_id]["name"] for champ_id in data}
    dct["expiry_time"] = time.time() + 60 * 60 * 24 * 30  # Expires after one month
    if not helpers.store_json(dct, ".cache/champ_ids.json", True):
        return None
    return dct
def update_json(source, city_tag):
    """Convert cached building entries from the old dict format and re-save.

    The cache file ``<city_tag>.json`` located next to ``source`` is loaded,
    every building is wrapped in a ``Location``, any per-source result that
    is still a single dict is wrapped in a list, and the cache is written
    back.

    :param source: path whose directory holds the cache file
    :param city_tag: tag used to build the cache file name
    :raises AssertionError: if the cache lacks 'buildings' or 'parcels'
    """
    cache_file = "%s.json" % city_tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)

    local_cache = load_json(cache_destination, create=True)

    # `in` works on both Python 2 and 3; dict.has_key() is Python 2 only
    assert 'buildings' in local_cache
    assert 'parcels' in local_cache

    locations = {}
    for key, value in local_cache['buildings'].items():
        location = Location(value)

        # distinct loop name so the `source` parameter is not shadowed
        for source_name in location.sources:
            if hasattr(location, source_name):
                result = getattr(location, source_name)
                #convert from old dict format here
                if isinstance(result, dict):
                    # single pre-formatted argument: prints identically
                    # under the Python 2 statement and Python 3 function
                    print("Found dictionary in: %s for: %s" % (
                        source_name, location.address))

                    result = [result]
                    setattr(location, source_name, result)

        locations[key] = location

    #back it up for later
    #enable this when downloading GPS coordinates...
    #the rest of the time it slows things down
    local_cache['buildings'] = {}
    for key, value in locations.items():
        local_cache['buildings'][key] = value.to_dict()
    save_json(cache_destination, local_cache)
Exemple #4
	def update_state_json(self, content):
		"""Parse ``content`` as JSON and apply its 'cluster' entry as the new state.

		Returns True when the state was applied, False when parsing or the
		update raised ValueError or KeyError.
		"""
		# the `try:` opener was missing, leaving the except clause orphaned
		try:
			cstate = helpers.load_json(content)['cluster']
			self.update_state(cstate)
		except (ValueError, KeyError):
			return False
		return True
Exemple #5
 def update_state_json(self, content):
     """Parse ``content`` as JSON and apply its 'cluster' entry as the new state.

     Returns True on success, False when parsing or the update raised
     ValueError or KeyError.
     """
     # the `try:` opener was missing, leaving the except clause orphaned
     try:
         cstate = helpers.load_json(content)['cluster']
         # NOTE(review): restored so the parsed state is actually used, as in
         # the tab-indented variant of this method - confirm against upstream
         self.update_state(cstate)
     except (ValueError, KeyError):
         return False
     return True
Exemple #6
def get_account_id(summoner_name, SECRET_API_KEY=None):
    """Return the account id for ``summoner_name``, cached on disk for a month.

    :param summoner_name: summoner to look up; must be a non-empty ``str``
    :param SECRET_API_KEY: API key; read with ``read_api_key()`` when falsy
    :return: the ``accountId`` value, or the raw response object when the
        request fails with a status other than 200/429
    :raises TypeError: if ``summoner_name`` is empty or not a string
    """
    cache_path = ".cache/summoners/" + summoner_name + ".json"
    dct = helpers.load_json(cache_path)
    if dct and time.time() <= dct["expiry_time"]:
        # cache hit that has not expired yet
        return dct["accountId"]

    if not SECRET_API_KEY:
        SECRET_API_KEY = read_api_key()
    if not summoner_name:
        raise TypeError("Summoner name cannot be None")
    if not isinstance(summoner_name, str):
        raise TypeError("Summoner name must be a string")
    url = BASE + "/lol/summoner/v3/summoners/by-name/" + summoner_name
    headers = {"X-Riot-Token": SECRET_API_KEY}
    r = requests.get(url, headers=headers)
    if r.status_code == 429:
        t = r.json()["Retry-After"]
        print("Waiting " + str(t) + " seconds and trying again")
        print("Full response:")
        # wait, then retry THIS lookup (the previous code retried a match
        # lookup with an undefined match id)
        time.sleep(float(t))
        return get_account_id(summoner_name, SECRET_API_KEY)
    if r.status_code != 200:
        print("Get account id failed")
        return r

    response = r.json()
    response["expiry_time"] = time.time() + 60 * 60 * 24 * 30  # Expires after one month
    helpers.store_json(response, cache_path, True)
    return response["accountId"]
Exemple #8
def get_match_from_id(matchid, SECRET_API_KEY=None):
    """Return the match object for ``matchid``, using a per-match file cache.

    :param matchid: integer match id
    :param SECRET_API_KEY: API key; read with ``read_api_key()`` when falsy
    :return: the decoded match, or the raw response object when the request
        fails with a status other than 200/429
    :raises TypeError: if ``matchid`` is falsy or not an ``int``
    """
    import time  # local import: this block did not previously use `time`

    cache_path = ".match_cache/" + str(matchid) + ".json"
    match = helpers.load_json(cache_path)
    if match:
        return match

    if not SECRET_API_KEY:
        SECRET_API_KEY = read_api_key()
    if not matchid:
        raise TypeError("Match id cannot be None")
    if not isinstance(matchid, int):
        raise TypeError("Match id must be an int")
    url = BASE + "/lol/match/v3/matches/" + str(matchid)
    headers = {"X-Riot-Token": SECRET_API_KEY}
    r = requests.get(url, headers=headers)
    if r.status_code == 429:
        print("\n" * 5)
        print("\n" * 5)
        # fall back to a fixed delay when the body carries no Retry-After
        try:
            t = r.json()["Retry-After"]
        except (ValueError, KeyError, TypeError):
            t = 180
        print("Waiting " + str(t) + " seconds and trying again")
        print("Full response:")
        time.sleep(float(t))
        return get_match_from_id(matchid, SECRET_API_KEY)
    if r.status_code != 200:
        print("Get match object failed")
        return r

    match = r.json()
    helpers.store_json(match, cache_path, True)
    return match
# Compare the next line of `source_file` (after tokenizing and preprocessing)
# with the "src" of the last item stored in the previous shard file, logging
# the outcome; returns False on a mismatch and True otherwise.
# NOTE(review): this block looks truncated - the closing `):` of the
# signature, most `preprocess(...)` arguments, the `try:` line, the
# `logger.error(` openers, the body of the newline check and the `else:` of
# the final comparison are not present. Restore from upstream before use.
def check_resume_success(
    nlp, args, source_file, last_shard, output_path, split, compression
    logger.info("Checking if resume was successful...")
    # the shard to inspect is the one written just before `last_shard`
    chunk_file_path_str = split + "." + str(last_shard - 1) + ".json"
    if compression:
        chunk_file_path_str += ".gz"
    chunk_file_path = os.path.join(output_path, chunk_file_path_str)

    # reading here advances `source_file` by one line
    line_source = source_file.readline().strip()

    line_source_tokenized = next(tokenize(nlp, [line_source]))

    # Apply preprocessing on the line
    preprocessed_line = preprocess(
        [1] * len(line_source_tokenized),

        chunk_json, _ = load_json(chunk_file_path)
    except FileNotFoundError:
            "The file at path %s was not found. Make sure `--compression` is set correctly.",
    last_item_chunk = chunk_json[-1]
    line_chunk = last_item_chunk["src"]

    # remove the last item if it is a newline
    if line_chunk[-1] == ["\n"]:

    if line_chunk == preprocessed_line:
        logger.info("Resume Successful!")
        logger.debug("`source_file` moved forward one line")
        # NOTE(review): an `else:` presumably belongs here - confirm
        logger.info("Resume NOT Successful")
        logger.info("Last Chunk Line: %s", line_chunk)
        logger.info("Previous (to resume line) Source Line: %s", preprocessed_line)
        # skipcq: PYL-W1201
                "Common causes of this issue:\n"
                + "1. You changed the `--shard_interval`. You used a different interval previously than you used in the command to resume.\n"
                + "2. The abstractive (`.source` and `.target`) or extractive (`.json`) dataset files were modified or removed. The last `.json` file needs to be in the same folder it was originally outputted to so the last shard index and be determined and the last line can be read.\n"
                + "3. It is entirely possible that there is a bug in this script. If you have checked that the above were not the cause and that there were no issues pertaining to your dataset then open an issue at https://github.com/HHousen/TransformerSum/issues/new."
        return False

    return True
def test_mingrate_pids(app, location, datadir):
    """Test migrate pids."""
    # build a record from the first entry of the demo project dump
    data = load_json(datadir, 'cds_records_demo_1_project.json')
    dump = CDSRecordDump(data=data[0])
    record = CDSRecordDumpLoader.create(dump=dump)

    # NOTE(review): the iterable of this comprehension (and its closing
    # bracket) is missing from the visible source - restore before running
    pids = [pid.pid_value for pid in
    expected = sorted(['2093596', 'CERN-MOVIE-2012-193'])
    assert sorted(pids) == expected
Exemple #11
def _load_from_file(filename):
	"""Load the JSON configuration stored in ``filename``.

	Returns False when the file cannot be read or parsed; otherwise returns
	whatever ``_load_from_content`` returns for the parsed content.
	"""
	# the `try:` opener was missing, leaving the except clause orphaned
	try:
		with open(filename, 'r') as f:
			content = helpers.load_json(f.read())
	except (IOError, ValueError, KeyError) as e:
		_logger.warn('Cannot read file %s with json configuration %s', filename, e)
		return False

	r = _load_from_content(content)
	if r:
		_logger.info('Loading %s', filename)
	return r
Exemple #12
def configure():
    """Render the 'configure' template with every stored method configuration.

    Each entry of ``data_path`` that contains a ``config.json`` contributes
    its loaded configuration, keyed by the entry name.
    """
    loaded = {}
    for entry in os.listdir(data_path):
        config_file = os.path.join(os.path.join(data_path, entry), "config.json")
        # entries without a configuration file are simply left out
        if not os.path.exists(config_file):
            continue
        loaded[entry] = load_json(config_file)
    return template('configure', options=loaded)
Exemple #13
def _load_from_file(filename):
    """Load the JSON configuration stored in ``filename``.

    Returns False when the file cannot be read or parsed; otherwise returns
    whatever ``_load_from_content`` returns for the parsed content.
    """
    # the `try:` opener was missing, leaving the except clause orphaned
    try:
        with open(filename, 'r') as f:
            content = helpers.load_json(f.read())
    except (IOError, ValueError, KeyError) as e:
        _logger.warn('Cannot read file %s with json configuration %s',
                     filename, e)
        return False

    r = _load_from_content(content)
    if r:
        _logger.info('Loading %s', filename)
    return r
Exemple #14
def subject_new(key):
    """Create a new subject for the Q-Method specified by key.

    Renders the '404' template when the method directory does not exist.
    Otherwise a subject id that is unused inside the method directory is
    generated, a directory for it is created, an initial
    ``subject_config.json`` is saved there, and the request is redirected
    to the method's page.

    :param key: id (directory name under ``data_path``) of the method
    :raises ValueError: if the path for the new subject id already exists
    """
    method_path = os.path.join(data_path, key)
    if not os.path.exists(method_path):
        return template('404', key=key, item="method")

    #look for existing subjects:
    options = os.listdir(method_path)
    new_option = ""
    #make sure that:
    #a) we have a new id and
    #b) the new id has not already been used
    while (not new_option) or (new_option in options):
        new_option = generate_id()

    #make new directory in method_path
    subject_path = os.path.join(method_path, new_option)
    if os.path.exists(subject_path):
        #This should not ever happen with above check, but just in case...
        raise ValueError("Subject path exists, but it shouldn't: %s" % subject_path)
    os.makedirs(subject_path)

    #make an empty configuration file
    config = os.path.join(subject_path, "subject_config.json")
    result = load_json(config, create=True)
    #once the subject starts sorting, we will cache this locally
    #based on the current state of the method configuration
    #result['statements'] = ""
    result['columns'] = u""
    result['json'] = u""
    result['started'] = u""
    #a textual representation of where each statement is
    result['state'] = u""
    result['history'] = u""
    #is it finished? complete? this will prevent further changes:
    result['locked'] = False
    now = datetime.now()
    result['created'] = now.strftime("%Y.%m.%d %H:%M:%S")

    # after first movement
    result['started'] = u""
    result['last_update'] = u""

    save_json(config, result)

    #redirect to the method's page:
    redirect("/method/" + key + "/")
Exemple #15
def main():
    """Merge values parsed from entry.txt into data/states.json and dump the result.

    Each parsed value is stored under the "marriage_age" key of the entry
    with the matching name; the entries are then re-keyed by a cleaned-up
    name and written to foo.json.

    NOTE(review): the parsing loop and the clean-up loop look truncated (a
    `try:` line and at least two branch bodies are missing) - restore them
    from the original before running.
    """

    data = helpers.load_json("data/states.json")

    # a list of records is re-keyed by each record's "full_name"
    if not isinstance(data, dict):
        data = { x["full_name"]:x for x in data }

    key = "marriage_age"

    new = {}
    lines = helpers.read_lines("entry.txt")
    # drop empty lines
    lines = [ x for x in lines if x ]
#    lines = lines[::4]

    for line in lines:

        # "<prefix>. <name>: <number>" format
        line = line.split(". ")[-1]
        name, num = line.split(": ", 1)
        new[name] = float(num)

            # NOTE(review): these lines are indented as the body of a `try:`
            # that is not present; tab-separated "<name> (...)\t<number>" format
            name = line.split("\t")[0]
            name = name.split("(")[0].strip()
            new[name] = float(line.split("\t")[1].replace(",", ""))
        except Exception:
            # NOTE(review): handler body is missing

    [ print(k, ":", v) for k, v in new.items() ]

    for name, val in new.items():
        if name not in data:
            data[name] = {}
        data[name][key] = val

    # Clean up the data
    cleaned = {}
    for k, v in data.items():
        # `key` is reused here for the cleaned name
        key = rmchars(k, ".")
        key = key.replace("Saint", "St")
        # NOTE(review): as written, `cleaned[key] = v` only runs when the key
        # is already present, so the next line raises KeyError for new keys;
        # a line in between appears to be missing - confirm
        if key in cleaned:
            cleaned[key] = v
        cleaned[key]["name"] = key

    return helpers.dump_json(cleaned, "foo.json")
def bulk_generation():
    # Loops over range(num_test_images) and prints the index with a progress
    # figure.
    # NOTE(review): most of the loop body is missing from the visible source:
    # the `if i==884:` branch has no statement and `progress` is never
    # assigned here - restore before running.
#    generated_captions=list()
    for i in range(num_test_images):
        if i==884:
#            image 884 (square_40) is corrupted
#        generated_captions.append(C)
        print(i, "Progress: %.2f" % progress)
Exemple #17
def post_method_json(key=None):
    """Apply submitted form values to a method's ``config.json``.

    Renders the '404' template when the method directory does not exist.
    Otherwise every submitted form field is compared with the stored
    configuration; changed values are written back and the file is saved
    only if at least one value changed.

    :param key: id (directory name under ``data_path``) of the method
    :return: the rendered '404' or 'success' template
    """
    #print dir(request.forms)
    #print request.forms.keys()
    method_path = os.path.join(data_path, key)
    if not os.path.exists(method_path):
        return template('404', key=key, item="method")

    config = os.path.join(method_path, "config.json")
    result = load_json(config)

    changed = False
    # `field` rather than `key`, so the method id parameter is not shadowed
    for field in request.forms.keys():
        #special case for 'statements' key...
        #want to get rid of any extra newline characters
        #this will help calculate the number of statements more accurately
        #(rather than stripping newlines everywhere we look at statements)
        #this works here, but it will make it difficult to provide
        #feedback to the user about how many statements there are
        #compared to how many spaces there are available in columns
        #adding a similar check in method.js
        if field == "statements":
            text = request.forms.get(field)
            value = '\n'.join(line for line in text.splitlines() if line)
        else:
            value = request.forms.get(field)

        if value != result[field]:
            #print "%s (original) != %s (new)" % (result[key], request.forms.get(key))
            result[field] = value
            changed = True

    if changed:
        #print "METHOD CONFIG CHANGED!!!! (saving)"
        save_json(config, result)
    return template('success')
Exemple #18
def get_matches_for_tcode(tcode):
	"""Return the matches played under tournament code ``tcode``.

	A cached list in ``.cache/tournament_matches/<tcode>.json`` is used when
	present, unless it is empty and older than two days.  On a cache miss
	the match ids are fetched and each match is resolved with
	``get_tournament_match``.

	Returns the list of matches, or the raw response object when the
	request fails with a status other than 200/404.
	"""
	path = ".cache/tournament_matches/" + tcode + ".json"
	# the try/except around the cache read was missing its keywords
	try:
		matches = helpers.load_json(path)
	except (IOError, ValueError):
		print("Failed to load path")
		matches = None
	miss = True
	if matches is not None:
		file_creation = os.path.getmtime(path)
		expiry_time = 3600 * 24 * 2
		# only an EMPTY cached list is ever considered stale
		miss = (matches == [] and time.time() > file_creation + expiry_time)
	if miss:
		print("Waiting due to cache miss")
		endpoint = "/lol/match/v3/matches/by-tournament-code/"+str(tcode)+"/ids"
		api_key = fetch_api_key()
		headers = {"X-Riot-Token": api_key}
		url = M_BASE + endpoint
		r = requests.get(url, headers=headers)
		if r.status_code == 404:
			print("Tcode " + str(tcode) + " does not have any games associated with it.")
			matches = []
			helpers.store_json(matches, path, True)
			return matches
		elif r.status_code == 429:
			print("Hit a retry-after when getting tournament matches. Exiting")
			# NOTE(review): hand the response back like other failures
			# instead of falling through and parsing an error body as a
			# list of match ids - confirm whether a hard exit was intended
			return r
		elif r.status_code != 200:
			print("Failed to get matches for tcode " + str(tcode))
			return r
		matches = []
		for match_id in r.json():
			match = get_tournament_match(match_id, tcode)
			if match:
				matches.append(match)
		helpers.store_json(matches, path, True)
	return matches
Exemple #19
def get_recent_history(accountid, SECRET_API_KEY=None):
    """Return the recent matches of ``accountid``, cached on disk for two hours.

    :param accountid: integer account id
    :param SECRET_API_KEY: API key; read with ``read_api_key()`` when falsy
    :return: the list of matches, ``[]`` when the response carries no
        "matches" entry, or the raw response object when the request fails
        with a status other than 200/429
    :raises TypeError: if ``accountid`` is falsy or not an ``int``
    """
    cache_path = ".cache/recent_histories/" + str(accountid) + ".json"
    hist = helpers.load_json(cache_path)
    if hist and time.time() <= hist["expiry_time"]:
        # cache hit that has not expired yet
        return hist["matches"]

    if not SECRET_API_KEY:
        SECRET_API_KEY = read_api_key()
    if not accountid:
        raise TypeError("Account id cannot be None")
    if not isinstance(accountid, int):
        raise TypeError("Account id must be an int")
    url = BASE + "/lol/match/v3/matchlists/by-account/" + str(
        accountid) + "/recent"
    headers = {"X-Riot-Token": SECRET_API_KEY}
    r = requests.get(url, headers=headers)
    if r.status_code == 429:
        t = r.json()["Retry-After"]
        print("Waiting " + str(t) + " seconds and trying again")
        print("Full response:")
        # wait, then retry THIS lookup (the previous code retried a match
        # lookup with an undefined match id)
        time.sleep(float(t))
        return get_recent_history(accountid, SECRET_API_KEY)
    if r.status_code != 200:
        print("Get match history failed")
        return r

    response = r.json()
    if "matches" not in response:
        print("There is no recent match history")
        return []
    hist = {
        # now PLUS two hours; multiplying pushed the expiry far into the
        # future so the cache never expired
        "expiry_time": time.time() + 60 * 60 * 2,
        "matches": response["matches"],
    }
    helpers.store_json(hist, cache_path, True)  # Expires after 2 hours
    return hist["matches"]
Exemple #20
def test_subformat_creation_if_missing(api_app, location, datadir, es, users):
    """Test subformat creation if missing."""
    # [[ migrate the video ]]
    migration_streams = get_migration_streams(datadir=datadir)
    data = load_json(datadir, 'cds_records_demo_1_video.json')
    dump = CDSRecordDump(data=data[0])
    # NOTE(review): the last patch in this `with` header is truncated - its
    # `mock.patch.object(` opener and closing `):` are not in the visible
    # source
    with mock.patch.object(DataCiteProvider, 'register'), \
            mock.patch.object(CDSRecordDumpLoader, '_create_frame',
                              side_effect=get_frames), \
            mock.patch.object(ExtractFramesTask, '_create_gif'), \
            mock.patch.object(CDSRecordDumpLoader, '_clean_file_list'), \
                CDSRecordDumpLoader, '_get_migration_file_stream_and_size',
        video = CDSRecordDumpLoader.create(dump=dump)

    with mock.patch.object(TranscodeVideoTask, 'run') as mock_transcode:
        deposit = deposit_video_resolver(video['_deposit']['id'])
        deposit_id = deposit.id
        # simulate the missing of a subformat
        del deposit['_files'][0]['subformat'][0]
        assert len(deposit['_files'][0]['subformat']) == 4
        #  recreate 240p format
        # NOTE(review): the call that takes these keyword arguments is missing
            record=video, deposit=deposit)
        # check subformats
        deposit = Video.get_record(deposit_id)
        rec_video = record_resolver.resolve(video['recid'])[1]
        #  rec_video = record_resolver.resolve(video['recid'])[1]
        assert len(deposit['_files'][0]['subformat']) == 5
        assert len(rec_video['_files'][0]['subformat']) == 5
        # check if transcoding is called properly
        assert mock_transcode.called is True
        # exactly one call is expected; its keyword arguments are checked
        [(_, call_args)] = mock_transcode.call_args_list
        assert call_args == {'preset_quality': '240p'}
def read_csv(source_csv):
    """Import building rows from ``source_csv`` for the "bloomington_in" city.

    Looks up the City, FeedInfo, Person ("Blank") and Source records, loads
    the JSON cache stored next to the CSV, geocodes each row's address via
    ``Geo``, creates buildings and people with ``make_building`` /
    ``make_person``, and finally writes "bloomington-filtered.tsv" with
    ``save_results``.

    NOTE(review): this is Python 2 code (print statements, ``has_key``,
    ``reader.next()``) and several ``else:`` lines / branch bodies appear to
    be missing from the visible source - see the inline notes.
    """
    city_options = City.objects.filter(tag="bloomington_in")
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = "Bloomington"
        ## city.tag = to_tag(city.name)
        ## city.save()
        # NOTE(review): unreachable after the raise as written; presumably
        # this assignment belongs to a missing `else:` - confirm
        city = city_options[0]

    print city

    feed_date = "2013-08-29"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
        # NOTE(review): a missing `else:` presumably precedes the creation below
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        print "Created new feed: %s" % feed.city

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
        # NOTE(review): a missing `else:` presumably precedes the creation below
        person = Person()
        person.name = "Blank"
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
        # NOTE(review): a missing `else:` presumably precedes the creation below
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        print "Created new source: %s" % feed_source.feed.city

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key("buildings"):
        local_cache["buildings"] = {}
    if not local_cache.has_key("parcels"):
        local_cache["parcels"] = {}

    # rebuild Location objects from the cached building dicts
    locations = {}
    for key, value in local_cache["buildings"].items():
        locations[key] = Location(value)

    # geocoder helper:
    geo = Geo()

    skips = 0
    with codecs.open(source_csv, "rb", encoding="utf-8") as csvfile:
        # reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        reader = csv.reader(csvfile)

        # just print the first row:
        print ">, <".join(reader.next())

        count = 0
        for row in reader:
            count += 1
            print "Looking at row: %s" % count

            # could exit out early here, if needed
            # NOTE(review): this `if` has no statement in its body
            if count > 1000:
                # exit()

            bldg_id = row[0]
            print bldg_id

            address = row[1]
            print address

            owner = row[2]

            # skip this:
            ownder_contact = row[3]

            agent = row[4]

            bldg_units = row[9]
            print bldg_units

            units_bdrms = row[10]
            print units_bdrms

            # check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

            # make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
                # NOTE(review): a missing `else:` presumably starts the
                # processing below, and another one precedes `Location()`
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                    location = Location()

                # temporarily just want to look at google again
                location.sources = ["google"]

                # do some geocoding, as needed:
                search = "%s, Bloomington IN" % address.upper()

                any_updated = False
                for geo_source in location.sources:
                    update = geo.lookup(search, geo_source, location, force=True)
                    if update:
                        any_updated = True

                # put the full list of sources back after the google-only lookup
                location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True

                location.address_alt = search
                location.bldg_units = bldg_units
                location.units_bdrms = units_bdrms
                locations[address.upper()] = location

                # handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source)

                # owner_details = parse_person(owner)
                if owner:
                    result = special_cases(owner)
                    if result:
                        (owner_name, owner_address) = result
                        # NOTE(review): a missing `else:` presumably precedes
                        # the parse_person call below
                        (owner_name, owner_address, owner_phone, remainder) = parse_person(owner)
                        ## print "owner name: %s" % owner_name
                        ## print "owner address: %s" % owner_address
                        ## print ""

                        if owner_name:
                            (person, bldg_person) = make_person(owner_name, bldg, "Owner", address=owner_address)

                if agent and agent != "No Agent":
                    # agent_details = parse_person(agent)
                    (agent_name, agent_address, agent_phone, remainder) = parse_person(agent)
                    ## print "agent name: %s" % agent_name
                    ## print "agent address: %s" % agent_address
                    ## print ""

                    if agent_name:
                        (person, bldg_person) = make_person(agent_name, bldg, "Agent", address=agent_address, city=city)

                if any_updated:
                    # back it up for later
                    # enable this when downloading GPS coordinates...
                    # the rest of the time it slows things down
                    local_cache["buildings"] = {}
                    for key, value in locations.items():
                        local_cache["buildings"][key] = value.to_dict()
                    save_json(cache_destination, local_cache)


    save_results(locations, "bloomington-filtered.tsv")
Exemple #22
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction


# Relative path of the RSICD image directory.
image_dir = '../../Desktop/parsingDataset/RSICD_images/'

# NOTE(review): both np.load( calls are truncated - their arguments and
# closing parentheses are missing from the visible source.
inception_tv_train = np.load(
inception_tv_test = np.load(

# Loaded through the project's load_json helper from 'captions_train'.
captions_train = load_json('captions_train')

def softmax(x):
    """Return the softmax of the scores in x, normalised over all elements."""
    # shift by the global maximum before exponentiating
    exponentials = np.exp(x - np.max(x))
    total = exponentials.sum()
    return exponentials / total

# Shared nltk SmoothingFunction instance; presumably used by the BLEU helper
# below - its use is not visible in this chunk.
chencherry = SmoothingFunction()

def bleu(reference, candidate, grade=1):
    # NOTE(review): the body is truncated after these two statements; how
    # `candidate` and `grade` are used is not visible in this chunk.
    reference_tokenized = word_tokenize(reference)
    reference_list = list()
def read_csv(source):
    """Read rental permit rows from ``source`` for the "ann_arbor" city.

    Loads the JSON cache stored next to the CSV, geocodes the address of
    every row that is a 'RENTAL' permit whose status is neither 'EXPIRED'
    nor 'CLOSED', re-saves the cache after each row, and prints the
    collected status types and building numbers at the end.

    NOTE(review): this is Python 2 code (print statements, ``has_key``,
    ``reader.next()``) and several ``else:`` lines / branch bodies appear to
    be missing from the visible source - see the inline notes.
    """
    #for reading unicode
    #f = codecs.open(source, 'r', encoding='utf-8')

    city_options = City.objects.filter(tag="ann_arbor")
    print len(city_options)
    if not len(city_options):
        city = City()
        city.name = "Ann Arbor"
        city.tag = to_tag(city.name)
        # NOTE(review): a missing `else:` presumably precedes this line; as
        # written it indexes an empty result
        city = city_options[0]

    print city

    #setup FeedInfo item
    #and also create a Source item

    permit_sub_types = []
    status_types = []
    building_nums = []
    applicants = []
    managers = []

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}
    # rebuild Location objects from the cached building dicts
    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()
    #with open('eggs.csv', 'rb') as csvfile:
    with codecs.open(source, 'rb', encoding='utf-8') as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        reader = csv.reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0
        for row in reader:
            count += 1
            #could exit out early here, if needed
            # NOTE(review): this `if` has no statement in its body
            if count > 10:

            print row
            #type of building (eg: sf attached, duplex, etc)
            permit_id = row[0]

            #should always be "RENTAL" (don't need to track this one)
            permit_type = row[1]
            if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
                raise ValueError, "Unexpected permit type: %s in row: %s" % (
                    permit_type, row)
            sub_type = row[2]
            #can use this to filter out non-rental or obsolete entries
            #don't need to track otherwise:
            status = row[3]
            parcel_id = row[4]
            address = row[5]

            #should be fixed per source:
            # NOTE(review): rebinds `city` (the City object above) to a string
            city = row[6]
            if not ( (city.lower() == 'ann arbor') or (city == '') ):
                raise ValueError, "Unexpected city: %s" % (city)

            sqft = row[7]
            number_of_buildings = row[8]
            applicant_name = row[9]
            number_of_stories = row[10]
            number_of_units = row[11]
            if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):
                #check if we've started processing any results for this row
                #if local_cache['buildings'].has_key(address.upper()):
                #    local_cache_cur = local_cache['buildings'][address.upper()]
                #    local_cache_cur = {}

                # NOTE(review): a missing `else:` presumably precedes
                # `Location()`; as written the cached location is discarded
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                    location = Location()

                #do some geocoding, as needed:
                search = "%s, Ann Arbor MI" % address.upper()

                # NOTE(review): this loop variable rebinds the `source` parameter
                for source in location.sources:
                    geo.lookup(search, source, location)

                location.address_alt = search

                locations[address.upper()] = location

                #local_cache['buildings'][address.upper()] = local_cache_cur

                #and check if a previous building object in the db exists
                #cur_building = Building()
                bldg = Building()
                bldg.type = sub_type

            #back it up for later
            local_cache['buildings'] = {}
            for key, value in locations.items():
                local_cache['buildings'][key] = value.to_dict()
            save_json(cache_destination, local_cache)

            #(to see what data is available)

            # NOTE(review): the three `if` statements below have no statement
            # in their bodies; presumably each appended to its list - confirm
            if not status in status_types:
                #print "adding: %s" % sub_type

            if not sub_type in permit_sub_types:
                #print "adding: %s" % sub_type

            building_num = row[8]
            if not building_num in building_nums:
                #print "adding: %s" % sub_type

            applicant = row[9]
            # NOTE(review): the closing `):` of this condition, an `else:` and
            # the bodies of the two inner `if` statements are missing
            if ( re.search('MGMT', applicant) or
                 re.search('REALTY', applicant) or 
                 re.search('PROPERTIES', applicant) or 
                 re.search('MANAGEMENT', applicant) or 
                 re.search('GROUP', applicant) or 
                 re.search('LLC', applicant) or 
                 re.search('L.L.C.', applicant) or 
                 re.search('INC', applicant)
                if not applicant in managers:
                if not applicant in applicants:

            #print ', '.join(row)

    ## print permit_sub_types
    print status_types
    print building_nums

def read_csv(source_csv, city_name, city_tag, max_rows=1000):
    """Import rental records from ``source_csv`` for an existing city.

    Every data row is turned into a street address, geocoded through
    ``Geo.lookup`` and stored with ``make_building`` / ``make_person``.
    Geocoding results are cached in ``<city.tag>.json`` next to the source
    file, and all locations are exported to ``<city_tag>.tsv`` at the end.

    The statements are written so that they run on Python 2 and Python 3.

    Args:
        source_csv: path of the CSV file to read; the first row is printed
            and otherwise ignored.
        city_name: city name used when building geocoder search strings.
        city_tag: tag of the ``City`` row everything is attached to.
        max_rows: stop after this many data rows (``None`` for no limit).

    Raises:
        ValueError: if no ``City`` with ``city_tag`` exists.
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    feed_date = "2013-07-31"

    #re-use existing rows when present; only create (and save) when missing,
    #so that the rows can be referenced as foreign keys further down
    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)

            #could exit out early here, if needed
            if max_rows is not None and count > max_rows:
                break

            address = row[0]

            #need to fix the number being at the end of the address
            parts = address.split(',')
            anumber = parts[-1]
            street = ",".join(parts[:-1])
            address = "%s %s" % (anumber, street)

            bldg_id = row[1]
            print(bldg_id)

            #this is where owner is stored
            invoice_note = row[6]
            print(invoice_note)
            if re.match('Sent to:', invoice_note):
                print("changing invoice note from: %s" % invoice_note)
                invoice_note = invoice_note[8:]
                print("to: %s" % invoice_note)
            else:
                print("!!!!!invoice note does not start with Sent to!!!!!")
                print("")
                print("")

            no_units = row[12]

            #check if this is one we want to skip
            #(a conversion to an empty value marks the row as skipped)
            if address.upper() in conversions:
                address = conversions[address.upper()]

            #make sure it's not one we're skipping:
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
                #nothing to geocode or store for this row
                continue

            #check if we've started processing any results for this row
            location = locations.get(address.upper())
            if location is None:
                location = Location()

            #temporarily just want to look at google again
            location.sources = ["google", "bing"]

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (address.upper(), city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                if geo.lookup(search, geo_source, location, force=True):
                    any_updated = True

            location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not getattr(location, "address_alt", None):
                any_updated = True

            location.address_alt = search
            locations[address.upper()] = location

            #handle the database storage
            bldg = make_building(location, bldg_id, city, feed_source, no_units=no_units)

            if invoice_note:
                make_person(invoice_note, bldg, "Permit Applicant")

            if any_updated:
                #back it up for later
                #(only when something changed... saving slows things down)
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
Exemple #27
    def input_data(self):
        """Return the input data, loading it from ``self._inputfile`` when
        nothing truthy is cached in ``self._input_data`` yet."""
        cached = self._input_data
        if cached:
            return cached

        self._input_data = load_json(self._inputfile)
        return self._input_data
Exemple #28
def create_table_command(json_file='table_columns.json',
                         table_name=None):
    """Return the command produced by ``CreateTable`` for ``table_name``.

    The column description is read from ``json_file`` with ``load_json`` and
    handed to ``CreateTable(table_name, col_data)``; the result of its
    ``parse_to_create_table_command()`` is returned unchanged.

    NOTE(review): the original signature was cut off after ``json_file`` and
    ``table_name`` was never bound. It is restored here as a keyword
    parameter without a usable default -- confirm the intended default.

    Raises:
        ValueError: if ``table_name`` is not supplied.
    """
    if table_name is None:
        raise ValueError("table_name is required")

    col_data = load_json(json_file)
    ct = CreateTable(table_name, col_data)
    return ct.parse_to_create_table_command()
def read_csv(source_csv, city_tag, feed_date, max_rows=7220):
    """Import rows of ``source_csv`` for an existing city via ``process_row``.

    The first two rows of the file are headers: the first is only printed,
    the second is normalised into the ``keys`` list passed to
    ``process_row``. Per-address results are kept in ``local_cache`` (seeded
    from ``<city.tag>.json`` next to the source file) and handed to
    ``all_done`` when the loop finishes.

    The statements are written so that they run on Python 2 and Python 3.

    Args:
        source_csv: path of the CSV file to read.
        city_tag: tag of the ``City`` row everything is attached to.
        feed_date: value matched against / stored in ``FeedInfo.added``.
        max_rows: stop after this many data rows (``None`` for no limit);
            handy for testing.

    Raises:
        ValueError: if no ``City`` with ``city_tag`` exists.
    """
    #could also use city.models.find_by_city_state
    city_options = City.objects.filter(tag=city_tag)
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    #re-use existing rows when present; only create (and save) when missing,
    #so that the rows can be referenced as foreign keys further down
    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    # ideally, should be able to use the database itself as the cache,
    # instead of using a local file
    # but it's also good to not have to repeat geo queries if going in bulk
    # the site code *will* make geo queries
    # so it's still a good idea to cache the coded address locally
    # even if using the site code for everything else.
    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    print(cache_destination)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    loaded_cache = load_json(cache_destination, create=True)

    #need to go through and load SearchResults separately
    #use street address as the key
    #for each address, store SearchResults object
    local_cache = {}
    for key in loaded_cache.keys():
        #this is useful if there is a cached value
        #that was not parsed correctly... this will remove it:
        if key.strip() == "some address with bad cached data":
            print("not adding:  %s" % key)
        else:
            current = loaded_cache[key]
            sr = SearchResults()
            sr.from_dict(current['results'], debug=False)
            current['results'] = sr
            local_cache[key] = current

    #reset skips for every run:
    #(open for writing only to truncate the file, then close it right away)
    with codecs.open("skips.txt", 'w', encoding='utf-8'):
        pass

    #applied in order: '/ ' has to be handled before the bare '/'
    replacements = (
        ('(', ''),
        (')', ''),
        ('-', '_'),
        ('.', ''),
        ('/ ', ''),
        ('/', '_'),
        ('"', ''),
        ('#', 'num'),
        (' ', '_'),
    )

    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        #*and* the second row in this case
        #(normalised into the keys used for each row)
        keys = []
        for item in next(reader):
            key = item.lower().strip()
            for old, new in replacements:
                key = key.replace(old, new)
            keys.append(key)
        print('>, <'.join(keys))

        #expected keys:
        #street_address, unit_if_applicable, unit_type, rent,
        #security_deposit, sq_feet_per_unit, num_bedrooms, num_bathrooms,
        #maximum_occupancy_per_unit, lease_period, availability, laundry,
        #parking, air_conditioning, pets, gym_fitness_center,
        #game_room_rec_center_community_center, pool, other_amenities,
        #bike_friendly, recycling, composting, gardening, public_transit,
        #walk_friendly, other_smartliving_features, who_pays_for_electricity,
        #who_pays_for_natural_gas, who_pays_for_water,
        #who_pays_for_trash_recycling_pickup,
        #who_pays_for_telephone_land_line, who_pays_for_cable,
        #who_pays_for_internet, electricity_provider,
        #electric_utility_cost_average_per_mo, electric_utility_cost_low,
        #electric_utility_cost_high, natural_gas_provider,
        #natural_gas_utility_cost_average_per_mo,
        #natural_gas_utility_cost_low, natural_gas_utility_cost_high,
        #energy_saving_features, utility_info_source, agent_property_manager,
        #property_website_url, agent_property_manager_address,
        #agent_property_manager_phone, owner, comments

        #raise this to resume a previous run part way through (e.g. 6439)
        start = 0

        #if you want to randomize the order... to distribute options more evenly
        #just do this in the original spreadsheet.
        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)

            #could exit out early here, if needed (for testing)
            #all_done() still runs after the loop
            if max_rows is not None and count > max_rows:
                break

            if count >= start:
                current = {}
                address = process_row(current, row, keys, local_cache, city, feed_source, count)
                local_cache[address] = current
                #saving the cache on every row would protect against crashes,
                #however, it makes things run considerably slower
                #especially once the cached file size grows.

    all_done(cache_destination, local_cache)
# Select the Django settings module, unless DJANGO_SETTINGS_MODULE is already
# set in the environment (setdefault leaves an existing value alone).
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "rentrocket.settings")

## from rentrocket import settings
## from django.core.management import setup_environ
## setup_environ(settings)

from city.models import City, to_tag

from helpers import save_json, load_json, Location, Geo, save_results, make_building

# The cache file lives in the same directory as this script.
cache_file = "cities.json"
cache_destination = os.path.join(os.path.dirname(__file__), cache_file)
#keep a local copy of data we've processed...
#this should help with subsequent calls
#to make sure we don't need to duplicate calls to remote geolocation APIs:
saved_cities = load_json(cache_destination, create=True)

#geocoder helper:
geo = Geo()

# One row per city: [name, state abbreviation, '', ''].
# NOTE(review): the meaning of the two trailing empty fields is not visible
# here -- confirm against the code that consumes this list. The list may also
# have continued beyond 'Berkeley' in the original script.
cities = [
    ['Bloomington', 'IN', '', ''],
    ['Ann Arbor', 'MI', '', ''],
    ['Albany', 'NY', '', ''],
    ['Iowa City', 'IA', '', ''],
    ['Burlington', 'VT', '', ''],
    ['Austin', 'TX', '', ''],
    ['Columbia', 'MO', '', ''],
    ['Madison', 'WI', '', ''],
    ['Lawrence', 'KS', '', ''],
    ['Berkeley', 'CA', '', ''],
]
Exemple #31
 def load(self, str_data):
     """Parse ``str_data`` with ``helpers.load_json`` into ``self.data``.

     Returns True on success and False when parsing raises ValueError
     (``self.data`` is left untouched in that case).
     """
     try:
         self.data = helpers.load_json(str_data)
     except ValueError:
         return False
     return True
from helpers import load_json
from helpers import print_progress
from copy import copy
import json

from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

#from tensorflow.python.keras.models import Model

# NOTE(review): the two np.load( calls below are truncated -- their arguments
# and closing parentheses are missing, so this script does not parse as
# written. Restore the file paths before running it.
transfer_values_train = np.load(
transfer_values_test = np.load(
captions_train = load_json('captions_train')

# Candidate captions are read from a JSON file.
BS_filename = 'InceptionCaptions/9_beamsearched.json'
with open(BS_filename, 'r') as f:
    candidate_captions = json.load(f)
# Load the transfer model
# After the first execution, you can comment out these lines
#from tensorflow.python.keras.applications import VGG16
#image_model = VGG16(include_top=True, weights='imagenet')
#image_model_transfer = Model(inputs=image_model.input,
#                             outputs=transfer_layer.output)
image_dir = 'UAV/images/'
filenames_test = load_json('filenames_test')
Exemple #33
	def load(self, str_data):
		"""Parse ``str_data`` with ``helpers.load_json`` into ``self.data``.

		Returns True on success and False when parsing raises ValueError
		(``self.data`` is left untouched in that case).
		"""
		try:
			self.data = helpers.load_json(str_data)
		except ValueError:
			return False
		return True
Exemple #35
# NOTE(review): this test appears to be truncated in several places (the
# signature, a number of multi-line asserts/dict literals and some
# mock.patch.object(...) / resolver calls have unbalanced brackets), so it
# does not parse as written. Restore the missing lines before relying on it.
def test_migrate_record(frames_required, api_app, location, datadir, es,
    """Test migrating a project and then a video from the demo record dumps.

    Both records are created through ``CDSRecordDumpLoader.create``; the test
    then asserts on the resulting records, their deposits and their files.
    """
    # [[ migrate the project ]]
    data = load_json(datadir, 'cds_records_demo_1_project.json')
    dump = CDSRecordDump(data=data[0])
    project = CDSRecordDumpLoader.create(dump=dump)
    p_id = project.id

    assert project['$schema'] == Project.get_record_schema()
    assert project['publication_date'] == '2016-01-05'
    assert 'license' not in project
    assert 'copyright' not in project
    assert project['_cds'] == {
        "state": {
            "file_transcode": "SUCCESS",
            "file_video_extract_frames": "SUCCESS",
            "file_video_metadata_extraction": "SUCCESS"
        'modified_by': users[0],

    # check project deposit
    deposit_project_uuid = PersistentIdentifier.query.filter_by(
        pid_type='depid', object_type='rec').one().object_uuid
    deposit_project = Record.get_record(deposit_project_uuid)
    assert Project._schema in deposit_project['$schema']
    assert project.revision_id == deposit_project[
    assert deposit_project['_deposit']['created_by'] == 1
    assert deposit_project['_deposit']['owners'] == [1]
    assert deposit_project['_files'] == []

    # [[ migrate the video ]]
    data = load_json(datadir, 'cds_records_demo_1_video.json')
    dump = CDSRecordDump(data=data[0])

    def check_symlinks(video):
        # Exactly one file is expected, and its link path must exist
        # (lexists also accepts a broken symlink).
        symlinks_creator = SymlinksCreator()
        files = list(symlinks_creator._get_list_files(record=video))
        assert len(files) == 1
        for file_ in files:
            path = symlinks_creator._build_link_path(
                symlinks_creator._symlinks_location, video, file_['key'])
            assert os.path.lexists(path)

    def check_gif(video, mock_gif):
        # Inspect the keyword arguments of the first recorded call on mock_gif.
        # called only once for deposit
        (_, _, mock_args) = mock_gif.mock_calls[0]
        # check gif record
        video = CDSRecord(dict(video), video.model)
        # check gif deposit
        deposit = deposit_video_resolver(video['_deposit']['id'])
        master_video = CDSVideosFilesIterator.get_master_video_file(deposit)
        assert mock_args['master_id'] == master_video['version_id']
        assert str(deposit.files.bucket.id) == mock_args['bucket']
        #  assert mock_args['bucket'].id == deposit.files.bucket.id
        assert len(mock_args['frames']) == 10
        assert 'output_dir' in mock_args

    # Create the video with DOI registration, frame creation, gif creation,
    # file streaming and file-list cleaning patched out.
    migration_streams = get_migration_streams(datadir=datadir)
    with mock.patch.object(DataCiteProvider, 'register'), \
            mock.patch.object(CDSRecordDumpLoader, '_create_frame',
                              side_effect=get_frames), \
            mock.patch.object(CDSRecordDumpLoader, '_get_minimum_frames',
                              return_value=frames_required) as mock_frames, \
                ExtractFramesTask, '_create_gif') as mock_gif, \
                CDSRecordDumpLoader, '_get_migration_file_stream_and_size',
                side_effect=migration_streams), \
            mock.patch.object(CDSRecordDumpLoader, '_clean_file_list'):
        video = CDSRecordDumpLoader.create(dump=dump)
        assert mock_frames.called is True
    video_id = video.id
    # check smil file
    smil_obj = ObjectVersion.query.filter_by(
        key='CERN-MOVIE-2012-193-001.smil', is_head=True).one()
    storage = smil_obj.file.storage()
    assert '<video src' in storage.open().read().decode('utf-8')
    # check video symlinks
    # check gif
    check_gif(video, mock_gif)
    # check project
    project = Record.get_record(p_id)
    assert project['videos'] == [
        {'$ref': 'https://cds.cern.ch/api/record/1495143'}
    assert video['$schema'] == Video.get_record_schema()
    assert video['date'] == '2012-11-21'  # metadata data
    assert video['publication_date'] == '2017-07-13'  # creation date (DB)
    assert video['_project_id'] == '2093596'
    assert video['license'] == [{
        'license': 'CERN',
        'url': 'http://copyright.web.cern.ch',
    assert video['copyright'] == {
        'holder': 'CERN',
        'year': '2012',
        'url': 'http://copyright.web.cern.ch',
    assert video['description'] == ''
    assert 'doi' in video
    assert video['_cds']['state'] == {
        "file_transcode": "SUCCESS",
        "file_video_extract_frames": "SUCCESS",
        "file_video_metadata_extraction": "SUCCESS"
    assert 'extracted_metadata' in video['_cds']

    def check_files(video):
        # Every object version in the record's bucket must carry the expected
        # keys in its dumped representation.
        bucket = CDSRecordDumpLoader._get_bucket(record=video)
        files = [dump_object(obj)
                 for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
        for file_ in files:
            assert as_bucket(file_['bucket_id']) is not None
            assert 'checksum' in file_
            assert 'content_type' in file_
            assert 'context_type' in file_
            assert FileInstance.query.filter_by(
                id=file_['file_id']) is not None
            assert 'key' in file_
            assert 'links' in file_
            assert 'content_type' in file_
            assert 'context_type' in file_
            assert 'media_type' in file_
            assert 'tags' in file_

        # check extracted metadata
        master_video = CDSVideosFilesIterator.get_master_video_file(video)
        assert any([key in master_video['tags']
                    for key in ExtractMetadataTask._all_keys])
        assert any([key in video['_cds']['extracted_metadata']
                    for key in ExtractMetadataTask._all_keys])

    def check_buckets(record, deposit):
        # Compare the dumped files of the record's bucket with those of the
        # deposit's bucket, one file attribute at a time.
        def get(key, record):
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
            return [file_[key] for file_ in files]

        def check(record, deposit, file_key, different=None):
            # `different` is the expected number of values present in the
            # record but not in the deposit.
            values_record = set(get(file_key, record))
            values_deposit = set(get(file_key, deposit))
            difference = len(values_record - values_deposit)
            assert different == difference

        def check_tag_master(record):
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            master = CDSVideosFilesIterator.get_master_video_file(record)
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)
                     if obj.get_tags().get('master')]
            assert all([file_['tags']['master'] == master['version_id']
                        for file_ in files])

        # 1 bucket record != 1 bucket deposit
        check(record, deposit, 'bucket_id', 1)
        # all file_id are the same except the smil file (only in record)
        check(record, deposit, 'file_id', 1)
        check(record, deposit, 'key', 1)
        # 18 object_version record != 17 object_version deposit
        check(record, deposit, 'version_id', 18)
        # check tag 'master' where is pointing
        # NOTE(review): check_tag_master is defined above but not called in
        # the visible code -- the call may have been lost here.

    def check_first_level_files(record):
        # NOTE(review): reads `deposit_video` from the enclosing scope rather
        # than from `record`; it is assigned further down in this test.
        [master] = [file_ for file_ in deposit_video['_files']
                    if file_['context_type'] == 'master']
        assert len(master['subformat']) == 5
        assert len(master['frame']) == 10
        # TODO assert len(master['playlist']) == ??
        assert len([file_ for file_ in deposit_video['_files']
                    if file_['context_type'] == 'master']) == 1
        duration = float(record['_cds']['extracted_metadata']['duration'])
        for frame in master['frame']:
            assert float(frame['tags']['timestamp']) < duration
            assert float(frame['tags']['timestamp']) > 0
        # check tag 'preset_quality'
        pqs = [form['tags']['preset_quality'] for form in master['subformat']]
        assert sorted(pqs) == sorted(['1080p', '240p', '360p', '480p', '720p'])
        # check tag 'display_aspect_ratio'
        dar = set([form['tags']['display_aspect_ratio']
                   for form in master['subformat']])
        assert dar == {'16:9'}

    def check_pids(record):
        """Check pids."""
        assert record['report_number'][0] == 'CERN-VIDEO-2012-193-001'
        assert PersistentIdentifier.query.filter_by(
            pid_value='CERN-VIDEO-2012-193-001').count() == 1
        assert PersistentIdentifier.query.filter_by(
            pid_value='CERN-MOVIE-2012-193-001').count() == 1


    # check video deposit
    deposit_video_uuid = PersistentIdentifier.query.filter(
        PersistentIdentifier.pid_type == 'depid',
        PersistentIdentifier.object_uuid != str(deposit_project_uuid),
        PersistentIdentifier.object_type == 'rec'
    deposit_video = Video.get_record(str(deposit_video_uuid))
    assert Video._schema in deposit_video['$schema']
    video = Record.get_record(video_id)
    assert video.revision_id == deposit_video[
    assert deposit_video['_deposit']['created_by'] == users[0]
    assert deposit_video['_deposit']['owners'] == [users[0]]
    assert deposit_video['_project_id'] == '2093596'
    assert len(video['_files']) == 2
    assert len(deposit_video['_files']) == 2
    check_buckets(video, deposit_video)

    # try to edit video
    deposit_video = deposit_video_resolver(deposit_video['_deposit']['id'])
    deposit_video = deposit_video.edit()

    # try to edit project
    deposit_project = deposit_project_resolver(
    deposit_project = deposit_project.edit()

    # change the title, publish, and check the published record carries it
    deposit_video['title']['title'] = 'test'
    deposit_video = deposit_video.publish()
    _, record_video = deposit_video.fetch_published()
    assert record_video['title']['title'] == 'test'
Exemple #36
# This code computes the bleu score for the candidate sentence

import nltk
import math
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
import json

from helpers import load_json
#from NN_architecture import generate_caption
# run the NN architecture before
# Reference captions for the test set.
captions_test = load_json('captions_test')

# Earlier variant: read the generated captions from a plain text file,
# one caption per line, stripping the trailing newline.
##with open('generated_captions_VGG19.txt') as inFile:
#with open('captions_vgg16/4_generated_captions_VGG16.txt') as inFile:
#    generated_test_captions=inFile.readlines()
#for i in range(len(generated_test_captions)):
##    generated_test_captions[i]=generated_test_captions[i][1:]
#    generated_test_captions[i]=generated_test_captions[i].replace('\n','')

# load from json
with open('captions_vgg16/12_generated_captions_VGG16.json') as inFile:
    generated_test_captions = json.load(inFile)
# Duplicate the caption at index 883 and insert the copy at index 884.
# NOTE(review): presumably this re-aligns the generated captions with
# captions_test (one entry missing) -- confirm against the data.
c_to_insert = generated_test_captions[883]
generated_test_captions.insert(884, c_to_insert)
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
        city = city_options[0]

    print city

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
        person = Person()
        person.name = "Blank"
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city,
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        print "Created new source: %s" % feed_source.feed.city.name

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key('buildings'):
        local_cache['buildings'] = {}
    if not local_cache.has_key('parcels'):
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    skips = 0
    #with codecs.open(source_csv, 'rb', encoding='utf-8') as csvfile:
    with open(source_csv) as csvfile:
        #reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        #reader = csv.reader(csvfile)
        #reader = unicodecsv.UnicodeReader(csvfile, encoding='utf-8')

        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print '>, <'.join(reader.next())

        count = 0

        #want to randomize the order... distribute options more evenly
        #print len(reader)
        #in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print "Looking at row: %s" % count

            #could exit out early here, if needed
            if count > 10:

            print row
            address = row[0]

            ## no_units = row[12]

            #can pass this in as bldg_id to make_building
            #that gets used for parcel too
            parcel_id = row[1]
            bldg_id = parcel_id

            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            #eg building number
            qualifier_pre = row[6]
            #eg "UNIT" or "APT"
            qualifier_post = row[7]
            apt_num = row[8]
            #skip row9 (in/out... whatever that means)
            zip_code = row[10]
            #skip row11, assessor id
            #skip row12, address num
            #skip row13, x
            #skip row14, y
            #xcoord == lng
            lng = row[15]
            lat = row[16]

            #entry floor number: (named 'z' in sheet)
            floor = row[17]

            #skip row18, strcid... not sure
            #skip row19, parent
            #skip row20, app_
            #skip row21, hteloc
            zone = row[22]
            bldg_type = row[23]
            #number of buildings
            bldg_num = row[24]
            no_units = row[25]

            #skip row[26], inspection type
            #skip row27, app number
            #skip row28, date received
            #skip row29, application type
            #skip row30, ownerid
            #skip row31, operator id
            #skip row32, agent_id
            #skip row33, mail to
            central_heat = row[34]
            if central_heat == 'Y':
                central_heat = True
                central_heat = False

            #heat mechanism? heat mechanic??? not sure
            heat_mech = row[35]
            #skip row36, agent id (2)
            #skip row37, agent last name
            #skip row38 agent first name
            #skip row39 agent middle initial
            #skip row40, agent title
            #skip row41, business name

            #could be owner, could be agent
            owner_name = row[42]
            owner_address1 = row[43]
            owner_address2 = row[44]
            owner_city = row[45]
            owner_state = row[46]
            owner_zip = row[47]

            #address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])

            address_main = " ".join([
                street_num, street_dir, street_name, street_sfx, qualifier_pre
            address_main = address_main.strip()
            #get rid of any double spaces
            address_main = address_main.replace("  ", " ")

            apt_main = " ".join([qualifier_post, apt_num])
            apt_main = apt_main.strip()

            address = address_main
            print address

            owner_address = ", ".join([
                owner_address1, owner_address2, owner_city, owner_state,

            ## #should always be "RENTAL" (don't need to track this one)
            ## permit_type = row[1]
            ## if not permit_type == "RENTAL" and not permit_type == "MECHANICAL":
            ##     raise ValueError, "Unexpected permit type: %s in row: %s" % (
            ##         permit_type, row)

            ## bldg_type = row[2]

            ## #can use this to filter out non-rental or obsolete entries
            ## #don't need to track otherwise:
            ## status = row[3]
            ## parcel_id = row[4]

            ## #should be fixed per source:
            ## ss_city = row[6]

            ## bldg_sf = row[7]
            ## no_bldgs = row[8]
            ## applicant_name = row[9]
            ## no_stories = row[10]
            ## no_units = row[11]

            ## sqft = row[7]
            ## number_of_buildings = row[8]
            ## applicant_name = row[9]
            ## number_of_stories = row[10]
            ## number_of_units = row[11]

            #check if this is one we want to skip
            if conversions.has_key(address.upper()):
                address = conversions[address.upper()]

            ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

            #make sure it's not one we're skipping:
            if not address:
                print "SKIPPING ITEM: %s" % row[1]
                skips += 1
                #check if we've started processing any results for this row
                if locations.has_key(address.upper()):
                    location = locations[address.upper()]
                    location = Location()

            #temporarily just want to look at google again
            #location.sources = ["google"]
            #location.sources = ["google", "bing"]
            #location.sources = ["google", "bing", "usgeo", "geonames", "openmq"]
            #skip geocoding for columbia
            location.sources = []

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (address.upper(), city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                update = geo.lookup(search, geo_source, location, force=True)
                #update = geo.lookup(search, geo_source, location, force=False)
                if update:
                    any_updated = True

            location.sources = [
                'csv', "google", "bing", "usgeo", "geonames", "openmq", "mq"

            #manually add data from csv here:
            result = []
            result.append({'place': address, 'lat': lat, 'lng': lng})
            setattr(location, 'csv', result)

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not hasattr(location,
                           "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            #location.bldg_units = bldg_units
            #location.units_bdrms = units_bdrms
            locations[address.upper()] = location

            #handle the database storage
            bldg = make_building(location,

            if apt_main:
                unit = make_unit(apt_main, bldg)

            (person, bldg_person) = make_person(owner_name,

            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)



    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
Exemple #39
# can do this only at first execution
#from tensorflow.python.keras.applications import VGG16
#from tensorflow.python.keras.models import Model
#image_model = VGG16(include_top=True, weights='imagenet')
#image_model_transfer = Model(inputs=image_model.input,
#                             outputs=transfer_layer.output)

transfer_values_train = np.load(
transfer_values_test = np.load(
captions_train = load_json('captions_train')
filename = 'InceptionCaptions/5_beamsearched.json'

out_dir = 'best_beamsearched/InceptionV3/'

with open(filename, 'r') as inFile:
    beamCaptions = json.load(inFile)
    beamCaptions = tuple(beamCaptions)

def get_transfer_values(image_path):
    tv_len = transfer_values_test[0].shape[0]
    filename = image_path[len(image_dir):]
    for i in range(len(filenames_test)):
        if filenames_test[i] == filename:
def test_migrate_record(app, location, datadir, es):
    """Test migrate date."""
    # create the project
    data = load_json(datadir, 'cds_records_demo_1_project.json')
    dump = CDSRecordDump(data=data[0])
    project = CDSRecordDumpLoader.create(dump=dump)
    p_id = project.id

    date = '2015-11-13'
    assert project['$schema'] == Project.get_record_schema()
    assert project['date'] == date
    assert project['publication_date'] == date
    assert 'license' not in project
    assert 'copyright' not in project
    assert project['_cds'] == {
        "state": {
            "file_transcode": "SUCCESS",
            "file_video_extract_frames": "SUCCESS",
            "file_video_metadata_extraction": "SUCCESS"
        'modified_by': None,

    # check project deposit
    deposit_project_uuid = PersistentIdentifier.query.filter_by(
        pid_type='depid', object_type='rec').one().object_uuid
    deposit_project = Record.get_record(deposit_project_uuid)
    assert Project._schema in deposit_project['$schema']
    assert project.revision_id == deposit_project[
    assert deposit_project['_deposit']['created_by'] == -1
    assert deposit_project['_deposit']['owners'] == [-1]
    assert deposit_project['_files'] == []

    # create the video
    data = load_json(datadir, 'cds_records_demo_1_video.json')
    dump = CDSRecordDump(data=data[0])

    def load_video(*args, **kwargs):
        return open(join(datadir, 'test.mp4'), 'rb')

    with mock.patch.object(DataCiteProvider, 'register') as mock_datacite, \
                CDSRecordDumpLoader, '_get_migration_file_stream',
        video = CDSRecordDumpLoader.create(dump=dump)
        # assert mock_datacite.called is True
    project = Record.get_record(p_id)
    assert project['videos'] == [
        {'$ref': 'https://cds.cern.ch/api/record/1495143'}
    assert video['$schema'] == Video.get_record_schema()
    date = '2012-11-20'
    assert video['date'] == date
    assert video['publication_date'] == date
    assert video['_project_id'] == '2093596'
    assert video['license'] == [{
        'license': 'CERN',
        'url': 'http://copyright.web.cern.ch',
    assert video['copyright'] == {
        'holder': 'CERN',
        'year': '2012',
        'url': 'http://copyright.web.cern.ch',
    assert video['description'] == ''
    assert 'doi' in video
    assert video['_cds']['state'] == {
        "file_transcode": "SUCCESS",
        "file_video_extract_frames": "SUCCESS",
        "file_video_metadata_extraction": "SUCCESS"
    assert 'extracted_metadata' in video['_cds']

    def check_files(video):
        """Check the files in the video's bucket and its extracted metadata.

        For every object version in the bucket resolved from ``video``,
        assert that ``as_bucket`` returns something for its ``bucket_id``,
        that a ``FileInstance`` row exists for its ``file_id`` and that the
        dumped file exposes the expected keys.  Then assert that at least
        one of ``ExtractMetadataTask._all_keys`` is present both in the
        master file's tags and in the record's extracted metadata.

        :param video: record whose bucket and metadata are inspected.
        """
        expected_keys = (
            'checksum', 'content_type', 'context_type', 'key', 'links',
            'media_type', 'tags',
        )
        bucket = CDSRecordDumpLoader._get_bucket(record=video)
        files = [dump_object(obj)
                 for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
        for file_ in files:
            assert as_bucket(file_['bucket_id']) is not None
            # A Query object is never None, so the query has to be executed
            # for this assertion to check that the row really exists.
            assert FileInstance.query.filter_by(
                id=file_['file_id']).first() is not None
            for key in expected_keys:
                assert key in file_, key

        # check extracted metadata
        master_video = CDSVideosFilesIterator.get_master_video_file(video)
        assert any(key in master_video['tags']
                   for key in ExtractMetadataTask._all_keys)
        assert any(key in video['_cds']['extracted_metadata']
                   for key in ExtractMetadataTask._all_keys)

    def check_buckets(record, deposit):
        """Compare the files in the record's bucket with the deposit's.

        For several file attributes, assert how many distinct values are
        present in the record's bucket but absent from the deposit's, then
        verify where the ``master`` tag of the record's files points.

        :param record: record side of the comparison.
        :param deposit: deposit side of the comparison.
        """
        def get(key, record):
            """Return the ``key`` value of every file in the bucket."""
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)]
            return [file_[key] for file_ in files]

        def check(record, deposit, file_key, different=None):
            """Assert how many ``file_key`` values only the record has."""
            values_record = set(get(file_key, record))
            values_deposit = set(get(file_key, deposit))
            difference = len(values_record - values_deposit)
            assert different == difference

        def check_tag_master(record):
            """Assert every ``master`` tag holds the master's version id."""
            bucket = CDSRecordDumpLoader._get_bucket(record=record)
            master = CDSVideosFilesIterator.get_master_video_file(record)
            # only object versions that actually carry a 'master' tag
            files = [dump_object(obj)
                     for obj in ObjectVersion.get_by_bucket(bucket=bucket)
                     if obj.get_tags().get('master')]
            assert all(file_['tags']['master'] == master['version_id']
                       for file_ in files)

        # 1 bucket record != 1 bucket deposit
        check(record, deposit, 'bucket_id', 1)
        # all file_id are the same except the smil file (only in record)
        check(record, deposit, 'file_id', 1)
        check(record, deposit, 'key', 1)
        # 18 object_version record != 17 object_version deposit
        check(record, deposit, 'version_id', 18)
        # check tag 'master' where is pointing
        # (the helper was defined but never invoked, so this check was
        # silently skipped)
        check_tag_master(record)

    def check_first_level_files(record):
        """Check the master file of the deposit and its frame timestamps.

        Reads ``deposit_video`` from the enclosing scope: exactly one of its
        ``_files`` must have ``context_type == 'master'``, with 5 entries
        under ``subformat`` and 10 under ``frame``.  Every frame timestamp
        must lie strictly between 0 and the duration found in ``record``'s
        extracted metadata.

        :param record: record providing ``_cds.extracted_metadata.duration``.
        """
        masters = [entry for entry in deposit_video['_files']
                   if entry['context_type'] == 'master']
        # unpacking raises ValueError unless there is exactly one master
        [master] = masters
        assert len(master['subformat']) == 5
        assert len(master['frame']) == 10
        # TODO assert len(master['playlist']) == ??
        assert len(masters) == 1

        duration = float(record['_cds']['extracted_metadata']['duration'])
        for frame in master['frame']:
            timestamp = float(frame['tags']['timestamp'])
            assert timestamp < duration
            assert timestamp > 0

    # check video deposit
    deposit_video_uuid = PersistentIdentifier.query.filter(
        PersistentIdentifier.pid_type == 'depid',
        PersistentIdentifier.object_uuid != str(deposit_project_uuid),
        PersistentIdentifier.object_type == 'rec'
    deposit_video = Video.get_record(str(deposit_video_uuid))
    assert Video._schema in deposit_video['$schema']
    assert video.revision_id == deposit_video[
    assert deposit_video['_deposit']['created_by'] == -1
    assert deposit_video['_deposit']['owners'] == [-1]
    assert len(video['_files']) == 2
    assert len(deposit_video['_files']) == 2
    check_buckets(video, deposit_video)

    # try to edit video
    deposit_video = deposit_video_resolver(deposit_video['_deposit']['id'])
    deposit_video = deposit_video.edit()

    # try to edit project
    deposit_project = deposit_project_resolver(
    deposit_project = deposit_project.edit()

    # try to publish again the video
    deposit_video['title']['title'] = 'test'
    deposit_video = deposit_video.publish()
    _, record_video = deposit_video.fetch_published()
    assert record_video['title']['title'] == 'test'
    loss_mean = tf.reduce_mean(loss)

    return loss_mean

# This code is used to generate captions
# the CNN model, as well as the image size, has to be specified
image_model = VGG16(include_top=True, weights='imagenet')
image_model_transfer = Model(inputs=image_model.input,

# recreate the tokenizer
mark_start='ssss '
mark_end=' eeee'

def generate_caption(image_path, max_tokens=30):
    Generate a caption for the image in the given path.
    The caption is limited to the given number of tokens (words).

    # Load and resize the image.
Exemple #44
# This code computes the bleu score for the candidate sentence

import nltk
import math
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
import json

from helpers import load_json
#from NN_architecture import generate_caption
# run the NN architecture before
# Reference captions, loaded through the project's load_json helper.
captions_test = load_json('captions_test_saifullah')

# Earlier variant that read the generated captions from a plain-text file
# (one caption per line); kept disabled.
##with open('generated_captions_VGG19.txt') as inFile:
#with open('captions_vgg16/4_generated_captions_VGG16.txt') as inFile:
#    generated_test_captions=inFile.readlines()
#for i in range(len(generated_test_captions)):
##    generated_test_captions[i]=generated_test_captions[i][1:]
#    generated_test_captions[i]=generated_test_captions[i].replace('\n','')

# Generated captions are read from a JSON file instead.
with open('InceptionCaptions/9_greedy.json') as inFile:
    raw_captions = inFile.read()
generated_test_captions = json.loads(raw_captions)
Exemple #45
from collections import OrderedDict
from rwrs import app
import helpers

MAPS = helpers.load_json(app.config['MAPS_DATA_FILE'])
RANKS = helpers.load_json(app.config['RANKS_DATA_FILE'])

SQUADMATES_STEPS_XP = 1000 # One squad mate is gained every 1000 XP
MAX_SQUADMATES = 10 # Maximum squad mates allowed

    'vanilla': OrderedDict([
        (0, {
            'weapons': [
                {'image': 'assault_rifles', 'name': 'Assault rifles'},
                {'image': 'shotguns', 'name': 'Shotguns'}
            'equipment': [
                {'image': 'riot_shield', 'name': 'Riot shield'}
            'throwables': [
                {'image': 'hand_stun_grenades', 'name': '2 hand/stun grenades'}
        (500, {
            'weapons': [
                {'image': 'bazooka', 'name': 'Bazooka'},
                {'image': 'pistols_sd', 'name': 'Silenced pistols'}
            'equipment': [
Exemple #46
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true,

    # Keras may reduce this across the first axis (the batch)
    # but the semantics are unclear, so to be sure we use
    # the loss across the entire 2-rank tensor, we reduce it
    # to a single scalar with the mean function.
    loss_mean = tf.reduce_mean(loss)

    return loss_mean

# recreate the tokenizer
mark_start='ssss '
mark_end=' eeee'


def generate_caption(image_path, max_tokens=30):
    Generate a caption for the image in the given path.
    The caption is limited to the given number of tokens (words).