def run(places=None):
    """Print the most frequent topic for each place with a verified,
    correct FeaturePlace.

    places: optional iterable/queryset of Place objects.  NOTE: the
        queryset built when this is None is lazy and never evaluated
        below -- it is kept only for interface compatibility.

    Returns None; output goes to stdout.
    """
    try:
        start = datetime.now()
        if places is None:
            # will probably have to do a yield thing at some point
            # (lazy queryset; currently unused -- see docstring)
            places = Place.objects.exclude(featureplace=None)
        # ids of places that have at least one correct, verified feature
        place_ids = Feature.objects.filter(
            featureplace__correct=True,
            featureplace__feature__verified=True,
        ).values_list("featureplace__place_id", flat=True)
        print("place_ids:", len(place_ids))
        for place_id in place_ids:
            counter = Counter()
            # only verified, correct features with non-empty text count
            features = Feature.objects.filter(
                verified=True,
                featureplace__place_id=place_id,
                featureplace__correct=True,
            ).exclude(text=None).exclude(text="")
            for feature in features:
                if feature.text:
                    topic_id = get_topic(feature.text)
                    if topic_id:
                        counter[topic_id] += 1
                    else:
                        # text that resolved to no topic -- log it for review
                        print(feature.text)
            print("counter:", counter)
            most_common_topics = counter.most_common(1)
            if most_common_topics:
                # most_common(1) returns [(topic_id, count)]; the inner
                # tuple is always truthy, so the old second check was dead
                most_common_topic_id = most_common_topics[0][0]
                print("\tmost_common_topic for",
                      Place.objects.get(id=place_id), "is",
                      Topic.objects.get(id=most_common_topic_id).name)
        print("took", (datetime.now() - start).total_seconds(), "seconds")
    except Exception as e:
        # FIX: previously only the message was printed, losing the
        # traceback; keep the swallow-and-log behavior but show where
        # the failure happened.
        import traceback
        traceback.print_exc()
        print(e)
def run(places=None):
    # Python 2 variant of run(): prints the most frequent topic for each
    # place that has a verified, correct FeaturePlace.
    #
    # places: optional iterable/queryset of Place objects.
    #     NOTE(review): the queryset built when places is None is lazy and
    #     never evaluated below -- presumably leftover code; confirm.
    #
    # Any exception is caught and printed (silently swallowed otherwise).
    try:
        start = datetime.now()
        if places is None:
            # will probably have to do a yield thing at some point
            #places = Place.objects.exclude(featureplace=None, featureplace__correct=True, featureplace__feature__verified=True)
            places = Place.objects.exclude(featureplace=None)
        # ids of places that have at least one correct, verified feature
        place_ids = Feature.objects.filter(featureplace__correct=True, featureplace__feature__verified=True).values_list("featureplace__place_id", flat=True)
        print "place_ids:", len(place_ids)
        for place_id in place_ids:
            counter = Counter()
            # only verified, correct features with non-empty text are counted
            for feature in Feature.objects.filter(verified=True, featureplace__place_id=place_id, featureplace__correct=True).exclude(text=None).exclude(text=""):
                if feature.text:
                    topic_id = get_topic(feature.text)
                    if topic_id:
                        counter[topic_id] += 1
                    else:
                        # text that resolved to no topic -- print for review
                        print feature.text
            print "counter:", counter
            # most_common(1) returns [(topic_id, count)] or [] when empty
            most_common_topics = counter.most_common(1)
            if most_common_topics:
                most_common_topic_tuple = most_common_topics[0]
                if most_common_topic_tuple:
                    most_common_topic_id = most_common_topic_tuple[0]
                    print "\tmost_common_topic for", Place.objects.get(id=place_id), "is", Topic.objects.get(id=most_common_topic_id).name
        print "took", (datetime.now() - start).total_seconds(), "seconds"
    except Exception as e:
        # NOTE(review): swallows all errors and prints only the message;
        # the traceback is lost.
        print e
def resolve_locations(locations, order_id, max_seconds=10, countries=None,
                      admin1codes=None, debug=True, end_user_timezone=None,
                      case_insensitive=None):
    """Resolve extracted location dicts to Place rows via the MARGE
    resolver and persist Feature/FeaturePlace records for an order.

    locations: list of dicts with at least a 'name' key; optional keys:
        'country', 'country_code', 'context', 'count', 'date'.
    order_id: id of the Order the features belong to.
    max_seconds, debug, end_user_timezone, case_insensitive: accepted for
        interface compatibility; only printed / unused in this variant.
    countries, admin1codes: optional lists (printed only here).

    Returns True when at least one FeaturePlace was created, False when
    no candidate places matched, None if an exception occurred.
    """
    try:
        # FIX: countries/admin1codes were mutable default arguments ([])
        countries = countries if countries is not None else []
        admin1codes = admin1codes if admin1codes is not None else []
        indent = " " * 4  # FIX: was "" * 4, which is just ""
        print("starting resolve_locations with", type(locations))
        print(indent, "locations = ", len(locations), locations[:5])
        print(indent, "countries:", countries)
        print(indent, "admin1codes:", admin1codes)
        print(indent, "end_user_timezone:", end_user_timezone)
        print(indent, "case_insensitive:", case_insensitive)
        start = datetime.now()
        # raises Order.DoesNotExist if the order is missing (intentional)
        order = Order.objects.get(id=order_id)
        # make more resilient to unidecode
        name_country = {}
        name_country_code = {}
        name_location = {}
        name_topic = {}
        names = []
        normalized_names = set()
        for location in locations:
            # clean name a little just to play it safe; sometimes blank
            # depending on extract method
            name = location['name'] = location['name'].strip()
            try:
                print("name:", name)
            except Exception:
                pass  # best-effort: name may not be printable here
            # skip names with commas/parens/quotes -- won't be found in the
            # db anyway; probably needs a longer-term solution like escape
            # quoting in psql
            if name and not any(char in name for char in
                                [',', ')', '(', '?', "'", '"', "}", "{"]):
                names.append(name)
                normalized = normalize(name)
                normalized_names.add(normalized)
                name_location[normalized] = location
                if location.get('country', None):
                    name_country[normalized] = location['country']
                if location.get('country_code', None):
                    name_country_code[normalized] = location['country_code']
                if "context" in location:
                    topic_id = get_topic(location['context'])
                    location['topic_id'] = topic_id
                    name_topic[normalized] = topic_id
                else:
                    name_topic[normalized] = None
        number_of_locations = len(locations)
        print("number_of_locations:", number_of_locations)
        # randomize order in order to minimize statistical bias
        # NOTE: the subsequent set() discards the shuffled order; names is
        # only printed below, so this is harmless
        shuffle(names)
        names = list(set(names))
        try:
            print("names:", names)
        except UnicodeEncodeError:
            print("couldn't print statement because non-ascii")
        places = Place.objects.filter(
            name_normalized__in=normalized_names).values()
        number_of_places = len(places)
        if number_of_places == 0:
            return False
        print("places", type(places), len(places))
        copy_prop(places, "name_normalized", "feature_id")
        print("set places", len(places))
        places = marge_utils.to_dicts(marge_resolver.resolve(places))
        print("MARGE resolved:", len(places))
        # best score per feature_id; the option matching it is "correct"
        maxes = marge_utils.max_by_group(places, "score", "feature_id")
        print("maxes:", maxes)
        for option in places:
            fid = option["feature_id"]
            prob = option["score"]
            option["correct"] = prob == maxes[fid]
        print("resolver SET CORRECT")
        # recompose target places: group candidate rows by normalized name
        target_places = group_into_dict(places, "name_normalized")
        print("recomposed:", list(target_places.keys()))
        # Feature, FeaturePlace
        featureplaces = []
        for target, options in list(target_places.items()):
            print("target:", target)
            print("\toptions:", options)
            loc = name_location[target]
            topic_id = name_topic.get(target, None)
            count = loc.get('count', 1)
            # raises StopIteration if no option was marked correct
            # (cannot happen: maxes guarantees one match per feature_id)
            correct_option = next(
                option for option in options if option["correct"])
            geometry_used = "Shape" if correct_option["mpoly"] else "Point"
            feature = Feature.objects.create(
                count=count, name=loc["name"], geometry_used=geometry_used,
                order_id=order_id, topic_id=topic_id, verified=False)
            need_to_save = False
            if "context" in loc:
                feature.text = loc['context']
                need_to_save = True
            if "date" in loc:
                feature.end = loc['date']
                feature.start = loc['date']
                need_to_save = True
            if need_to_save:
                feature.save()
            for option in options:
                featureplaces.append(FeaturePlace(
                    confidence=option["score"],
                    correct=bool(option["correct"]),
                    feature=feature,
                    place_id=option["id"],
                    sort_order=-1))
        FeaturePlace.objects.bulk_create(featureplaces)
        print("resolved locations for order " + str(order_id))
        print("took:", (datetime.now() - start).total_seconds())
        return len(featureplaces) > 0
    except Exception as e:
        # FIX: keep the swallow-and-log behavior (callers treat None as
        # failure) but print the traceback so failures are debuggable.
        import traceback
        traceback.print_exc()
        print(e)
def resolve_locations(locations, order_id, max_seconds=10, countries=[], admin1codes=[]):
    # Python 2 variant: resolves extracted location names to geo entities
    # via stored SQL functions, disambiguates with KMeans clustering +
    # a prediction model, and persists Feature/FeaturePlace rows.
    #
    # locations: list of dicts with at least 'name'; optional 'context',
    #     'admin1code', 'count', 'date'.
    # order_id: id of the Order being processed (its start time bounds
    #     the time budget via max_seconds).
    # countries, admin1codes: optional filters.
    #     NOTE(review): mutable default arguments ([]) -- shared across
    #     calls if ever mutated; should be None-sentinels.
    #
    # Returns True when at least one FeaturePlace was created, None on
    # any exception (caught and printed below).
    try:
        print "starting resolve_locations with", type(locations)
        print "locations = ", len(locations), locations[:5]
        print "countries:", countries
        print "admin1codes:", admin1codes
        start = datetime.now()
        order = Order.objects.get(id=order_id)
        name_location = {}
        name_topic = {}
        names = []
        for location in locations:
            #cleaning name a little just to play it safe; sometimes have blank depending on extract method
            name = location['name'] = location['name'].strip()
            #skipping over places with commas and parentheses in them.. because won't find in db anyway... probably need more longterm solution like escape quoting in psql
            if name and not any(char in name for char in [',', ')', '(', '?', "'", '"']):
                names.append(name)
                name_location[name] = location
                if "context" in location:
                    topic_id = get_topic(location['context'])
                    location['topic_id'] = topic_id
                    name_topic[name] = topic_id
                else:
                    name_topic[name] = None
        number_of_locations = len(locations)
        #print "names", len(names), names[:5]
        # randomize order in order to minimize statistic bias
        # NOTE(review): set() below discards the shuffled order anyway
        shuffle(names)
        names = set(names)
        print "names:", names
        cursor = connection.cursor()
        # remaining time budget, measured from the order's start (UTC)
        seconds_left = max_seconds - (datetime.now().replace(tzinfo=UTC) - order.start).total_seconds()
        print "seconds_left:", seconds_left
        # last boolean arg to the SQL functions appears to toggle a slower,
        # more thorough mode when plenty of time remains -- confirm.
        # NOTE(review): SQL is built by string concatenation; the character
        # filter above excludes quotes/commas, but parameterized queries
        # would be safer.
        if seconds_left > 60:
            if countries:
                statement = "SELECT * FROM fdgis_resolve_with_countries('{" + ", ".join(names) + "}'::TEXT[], '{" + ", ".join(countries) + "}'::TEXT[], true);"
            else:
                statement = "SELECT * FROM fdgis_resolve('{" + ", ".join(names) + "}'::TEXT[], true);"
        else:
            if countries:
                statement = "SELECT * FROM fdgis_resolve_with_countries('{" + ", ".join(names) + "}'::TEXT[], '{" + ", ".join(countries) + "}'::TEXT[], false);"
            else:
                statement = "SELECT * FROM fdgis_resolve('{" + ", ".join(names) + "}'::TEXT[], false);"
        print "statement:\n", statement
        cursor.execute(statement)
        #print "executed"
        geoentities = [GeoEntity(row) for row in cursor.fetchall()]
        if admin1codes:
            geoentities = [g for g in geoentities if g.admin1code in admin1codes]
        print "filtering out geoentities that don't match admin1 code if there is an admin1 code match"
        # per-location admin1 filtering: if any candidate matches the
        # location's admin1code, drop the candidates that don't
        for location in locations:
            if 'admin1code' in location:
                name = location['name']
                admin1code = location['admin1code']
                if admin1code:
                    print "name:", name
                    print "admin1code:", admin1code
                    # are there any in geoentities that match
                    matches = []
                    not_matches = []
                    for geoentity in geoentities:
                        if geoentity.place_name == name or geoentity.alias == name:
                            if geoentity.admin1code == admin1code:
                                matches.append(geoentity)
                            else:
                                not_matches.append(geoentity)
                    #print "matches:", matches
                    #print "not_matches:", not_matches
                    if matches:
                        for geoentity in not_matches:
                            geoentities.remove(geoentity)
        number_of_geoentities = len(geoentities)
        #print "geoentities", type(geoentities), len(geoentities)
        # calculate median distance from every other point
        #all_cords = [geoentity.point.coords for geoentity in geoentities]
        target_geoentities = defaultdict(list)
        target_coords = defaultdict(list)
        all_coords = []
        for geoentity in geoentities:
            all_coords.append(geoentity.point.coords)
            target_geoentities[geoentity.target].append(geoentity)
            target_coords[geoentity.target].append(geoentity.point.coords)
        #number_of_clusters = max(3, number_of_locations/20)
        number_of_clusters = 3
        print "number_of_clusters:", number_of_clusters
        #centroids = kmeans(all_coords, number_of_clusters)[0]
        #print "centroids:", centroids
        # cluster all candidate coordinates; each entity is scored by how
        # populous its cluster is (cluster_frequency in [0, 1])
        estimator = KMeans(n_clusters=number_of_clusters)
        estimator.fit(all_coords)
        labels = estimator.labels_
        cluster_count = Counter()
        for cluster in labels:
            cluster_count[cluster] += 1
        cluster_frequency = {cluster: float(count) / number_of_geoentities for cluster, count in cluster_count.iteritems()}
        for i in range(number_of_geoentities):
            geoentities[i].cluster_frequency = cluster_frequency[labels[i]]
        #print "target_geoentities:", len(target_geoentities)
        for target, options in target_geoentities.items():
            #print "target:", target
            # median distance of each candidate to every candidate point
            for i, v in enumerate(median(cdist(target_coords[target], all_coords), axis=1)):
                target_geoentities[target][i].median_distance_from_all_other_points = int(v)
            #print "name_topic names are", name_topic.keys()
            topic_id = name_topic[target]
            #print "topic:", topic_id
            for option in options:
                #print "\toption.topic_id:", option.topic_id
                option.matches_topic = option.topic_id == topic_id
        print "add probability to each geoentity"
        predict.run(geoentities)
        # need to choose one for each target based on highest probability
        # (first option hitting the max wins; ties broken by order)
        for target, options in target_geoentities.items():
            max_probability = max([o.probability for o in options])
            found_correct = False
            for option in options:
                if not found_correct and option.probability == max_probability:
                    option.correct = True
                    found_correct = True
                else:
                    option.correct = False
        #Feature, FeaturePlace
        featureplaces = []
        for target, options in target_geoentities.items():
            l = name_location[target]
            topic_id = name_topic[target] if target in name_topic else None
            count = l['count'] if 'count' in l else 1
            feature = Feature.objects.create(count=count, name=target, geometry_used="Point", order_id=order_id, topic_id=topic_id, verified=False)
            need_to_save = False
            if "context" in l:
                feature.text = l['context']
                need_to_save = True
            if "date" in l:
                feature.end = l['date']
                feature.start = l['date']
                need_to_save = True
            if need_to_save:
                feature.save()
            for option in options:
                featureplaces.append(FeaturePlace(confidence=float(option.probability), correct=option.correct, country_rank=option.country_rank, feature=feature, median_distance=option.median_distance_from_all_other_points, place_id=option.place_id, popularity=option.popularity))
        FeaturePlace.objects.bulk_create(featureplaces)
        print "resolved locations for order " + str(order_id)
        print "took:", (datetime.now() - start).total_seconds()
        return len(featureplaces) > 0
    except Exception as e:
        # NOTE(review): swallows all errors (traceback lost) and falls
        # through to an implicit None return.
        print e