def compute_name_frequencies():
    """Compute a numeric distribution of name frequencies."""
    # Count how often each name part (i.e. token) shows up across
    # the whole of the dataset or a sample. Keeping this tally in
    # memory would be very expensive, so it lives in a redis hash
    # (TOKEN_KEY) instead, which can be iterated afterwards and is
    # reset at the start of each run.
    pipe = kv.pipeline(transaction=False)
    pipe.delete(TOKEN_KEY)
    names_count = 0
    for idx, token in enumerate(iter_tokens()):
        pipe.hincrby(TOKEN_KEY, token, 1)
        names_count += 1
        # Flush the pipeline in batches to cap its memory footprint.
        if idx > 0 and idx % 10000 == 0:
            pipe.execute()
            pipe = kv.pipeline(transaction=False)
    pipe.execute()
    log.info("Names: %d, unique: %d", names_count, kv.hlen(TOKEN_KEY))

    # Next, count how often each count occurs, i.e. make a histogram
    # of name frequency.
    counts = {}
    max_count = 0
    for _, count in kv.hscan_iter(TOKEN_KEY):
        count = int(count)
        # Leave out one-off tokens: they skew the distribution and
        # carry no useful signal.
        if count == 1:
            continue
        if count not in counts:
            counts[count] = 0
        counts[count] += 1
        # Track the maximum count seen.
        max_count = max((count, max_count))
    log.info("Counts: %d, max: %d", len(counts), max_count)

    # Turn the histogram into a cumulative distribution: DIST_KEY maps
    # each count to the number of distinct tokens occurring at least
    # that often, accumulated from max_count downwards.
    total = 0
    pipe = kv.pipeline(transaction=False)
    pipe.delete(DIST_KEY)
    for idx in range(max_count, 1, -1):
        total += counts.get(idx, 0)
        pipe.hset(DIST_KEY, idx, total)
        if idx % 10000 == 0:
            pipe.execute()
            pipe = kv.pipeline(transaction=False)
    log.info("Total: %d", total)
    pipe.set(TOTAL_KEY, total)
    pipe.execute()
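
# A sketch of how the stored distribution might be consumed: map a
# token's corpus count to a rarity score between 0 and 1 via the
# cumulative counts in DIST_KEY. This helper is hypothetical (not part
# of the original module) and assumes the redis client is configured
# to decode responses to strings.
def token_rarity(token):
    """Fraction of distinct tokens at least as frequent as `token`:
    close to 1.0 for rare tokens, near 0.0 for ubiquitous ones."""
    count = kv.hget(TOKEN_KEY, token)
    if count is None or int(count) < 2:
        # One-off tokens were excluded from the distribution.
        return 1.0
    total = int(kv.get(TOTAL_KEY) or 0)
    if total == 0:
        return 1.0
    # DIST_KEY[count] holds the number of distinct tokens that occur
    # at least `count` times.
    at_least = int(kv.hget(DIST_KEY, int(count)) or 0)
    return at_least / float(total)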
def load_places():
    if kv.get(PLACE_KEY) or settings.TESTING:
        return
    total = 0
    pipe = kv.pipeline(transaction=False)
    log.debug("Loading geonames...")
    with io.open(settings.GEONAMES_DATA, 'r', encoding='utf-8') as fh:
        for row in csv.reader(fh, delimiter='\t'):
            # Geonames column layout: row[1] is the name, row[2] the
            # ASCII name, row[3] comma-separated alternate names and
            # row[8] the ISO country code.
            country = row[8].lower().strip()
            if not country:
                continue
            names = set(row[3].split(','))
            names.add(row[1])
            names.add(row[2])
            for name in names:
                name = normalize_label(name)
                if name is not None:
                    total += 1
                    pipe.lpush(place_key(name), country)
    pipe.set(PLACE_KEY, total)
    pipe.execute()
    log.debug("Loaded %s geonames.", total)
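
# A sketch of how the loaded geonames might be queried: pick the most
# common country code attached to a normalized name token. This helper
# is hypothetical (not part of the original module) and assumes the
# redis client decodes responses to strings.
from collections import Counter

def guess_country(name):
    token = normalize_label(name)
    if token is None:
        return None
    # load_places() pushes one country code per matching geoname onto
    # the list, so the most frequent entry is the best guess.
    countries = kv.lrange(place_key(token), 0, -1)
    if not countries:
        return None
    return Counter(countries).most_common(1)[0][0]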