Ejemplo n.º 1
0
def run(places=None):
    """Print the most frequent topic among the verified features of each place.

    For every place id, the texts of its verified features (those linked to
    the place through a FeaturePlace marked correct) are passed to
    ``get_topic`` and the returned topic ids are tallied. The topic with the
    highest tally is printed next to the place.

    Args:
        places: optional iterable of objects exposing an ``id`` attribute
            (presumably ``Place`` instances -- confirm against callers).
            When ``None``, every place id that has at least one verified
            feature with a correct FeaturePlace is processed.

    Returns:
        None. Results are written to stdout only. Any exception is caught
        and printed instead of being propagated.
    """
    try:
        start = datetime.now()

        if places is None:
            place_ids = Feature.objects.filter(
                featureplace__correct=True,
                featureplace__feature__verified=True).values_list(
                    "featureplace__place_id", flat=True)
        else:
            # Previously place_ids was never assigned on this path, so any
            # call with explicit places ended in a NameError.
            place_ids = [place.id for place in places]

        # The join above returns one row per matching feature, so the same
        # place id can appear many times; process each place only once while
        # keeping the first-seen order.
        place_ids = list(dict.fromkeys(place_ids))

        print("place_ids:", len(place_ids))
        for place_id in place_ids:

            counter = Counter()
            features = Feature.objects.filter(
                verified=True,
                featureplace__place_id=place_id,
                featureplace__correct=True).exclude(text=None).exclude(text="")
            for feature in features:
                if not feature.text:
                    continue
                topic_id = get_topic(feature.text)
                if topic_id:
                    counter[topic_id] += 1
                else:
                    # Surface the texts for which no topic was returned.
                    print(feature.text)

            print("counter:", counter)
            most_common_topics = counter.most_common(1)
            if most_common_topics:
                # most_common(1) yields [(topic_id, tally)].
                most_common_topic_id = most_common_topics[0][0]
                print("\tmost_common_topic for",
                      Place.objects.get(id=place_id), "is",
                      Topic.objects.get(id=most_common_topic_id).name)

        print("took", (datetime.now() - start).total_seconds(), "seconds")

    except Exception as e:
        # Best-effort reporting script: log the failure and return None.
        print(e)
Ejemplo n.º 2
0
def run(places=None):
    """Print the most frequent topic among the verified features of each place.

    For every place id, the texts of its verified features (those linked to
    the place through a FeaturePlace marked correct) are passed to
    ``get_topic`` and the returned topic ids are tallied. The topic with the
    highest tally is printed next to the place.

    All output goes through single-argument ``print(...)`` calls so the
    function produces the same text on Python 2 and Python 3.

    Args:
        places: optional iterable of objects exposing an ``id`` attribute
            (presumably ``Place`` instances -- confirm against callers).
            When ``None``, every place id that has at least one verified
            feature with a correct FeaturePlace is processed.

    Returns:
        None. Results are written to stdout only. Any exception is caught
        and printed instead of being propagated.
    """
    try:
        start = datetime.now()

        if places is None:
            candidate_ids = Feature.objects.filter(
                featureplace__correct=True,
                featureplace__feature__verified=True).values_list(
                    "featureplace__place_id", flat=True)
        else:
            # Previously place_ids was never assigned on this path, so any
            # call with explicit places ended in a NameError.
            candidate_ids = [place.id for place in places]

        # The join above returns one row per matching feature, so the same
        # place id can appear many times; keep the first occurrence only.
        seen = set()
        place_ids = []
        for candidate_id in candidate_ids:
            if candidate_id not in seen:
                seen.add(candidate_id)
                place_ids.append(candidate_id)

        print("place_ids: %s" % len(place_ids))
        for place_id in place_ids:

            counter = Counter()
            features = Feature.objects.filter(
                verified=True,
                featureplace__place_id=place_id,
                featureplace__correct=True).exclude(text=None).exclude(text="")
            for feature in features:
                if not feature.text:
                    continue
                topic_id = get_topic(feature.text)
                if topic_id:
                    counter[topic_id] += 1
                else:
                    # Surface the texts for which no topic was returned.
                    print(feature.text)

            print("counter: %s" % (counter,))
            most_common_topics = counter.most_common(1)
            if most_common_topics:
                # most_common(1) yields [(topic_id, tally)].
                most_common_topic_id = most_common_topics[0][0]
                print("\tmost_common_topic for %s is %s" % (
                    Place.objects.get(id=place_id),
                    Topic.objects.get(id=most_common_topic_id).name))

        print("took %s seconds" % (datetime.now() - start).total_seconds())

    except Exception as e:
        # Best-effort reporting script: log the failure and return None.
        print(e)
Ejemplo n.º 3
0
def resolve_locations(locations,
                      order_id,
                      max_seconds=10,
                      countries=None,
                      admin1codes=None,
                      debug=True,
                      end_user_timezone=None,
                      case_insensitive=None):
    """Match location names against Place rows and store the scored candidates.

    Every usable name is normalized and looked up in ``Place`` through
    ``name_normalized``. The candidates are scored by ``marge_resolver``.
    For each matched name one ``Feature`` is created for the order, plus one
    ``FeaturePlace`` per candidate; candidates whose score equals the maximum
    of their group are flagged ``correct``.

    Args:
        locations: sequence of dicts (it is measured and sliced) with a
            ``name`` key and optional ``context``, ``date`` and ``count``
            keys. The dicts are mutated: ``name`` is stripped, and
            ``topic_id`` is added when a usable name comes with ``context``.
        order_id: id of the ``Order`` the created features belong to.
        max_seconds: accepted for interface compatibility; not used here.
        countries: only echoed in the log output here. ``None`` means an
            empty list.
        admin1codes: only echoed in the log output here. ``None`` means an
            empty list.
        debug: accepted for interface compatibility; not used here.
        end_user_timezone: only echoed in the log output here.
        case_insensitive: only echoed in the log output here.

    Returns:
        ``True`` when at least one ``FeaturePlace`` was created, ``False``
        when no ``Place`` matched or nothing was created, and ``None`` when
        an exception was caught (it is printed, not propagated).
    """
    try:
        # None replaces the former mutable [] defaults, which would have been
        # shared between calls.
        countries = [] if countries is None else countries
        admin1codes = [] if admin1codes is None else admin1codes

        # The original used "" * 4, which is an empty string, so the header
        # lines were never actually indented.
        indent = " " * 4
        print("starting resolve_locations with", type(locations))
        print(indent + "locations = ", len(locations), locations[:5])
        print(indent + "countries:", countries)
        print(indent + "admin1codes:", admin1codes)
        print(indent + "end_user_timezone:", end_user_timezone)
        print(indent + "case_insensitive:", case_insensitive)

        start = datetime.now()

        # The result is not used; the lookup is kept because it raises (and
        # the handler below reports it) when the order does not exist.
        Order.objects.get(id=order_id)

        # Characters that make a name unusable for the lookup.
        # TODO: a longer-term solution (e.g. proper escaping) is still needed.
        skipped_chars = [',', ')', '(', '?', "'", '"', "}", "{"]

        name_location = {}
        name_topic = {}
        names = []
        normalized_names = set()
        for location in locations:
            # Strip the name to play it safe; some extract methods yield
            # blank or padded names.
            name = location['name'] = location['name'].strip()
            try:
                print("name:", name)
            except UnicodeEncodeError:
                print("couldn't print statement because non-ascii")
            if not name or any(char in name for char in skipped_chars):
                continue
            names.append(name)
            normalized = normalize(name)
            normalized_names.add(normalized)
            name_location[normalized] = location
            if "context" in location:
                topic_id = get_topic(location['context'])
                location['topic_id'] = topic_id
                name_topic[normalized] = topic_id
            else:
                name_topic[normalized] = None

        number_of_locations = len(locations)
        print("number_of_locations:", number_of_locations)

        # names is only used for logging; duplicates are dropped. (A former
        # shuffle() here had no effect because set() discards the order.)
        names = list(set(names))
        try:
            print("names:", names)
        except UnicodeEncodeError:
            print("couldn't print statement because non-ascii")

        places = Place.objects.filter(
            name_normalized__in=normalized_names).values()

        number_of_places = len(places)

        if number_of_places == 0:
            return False

        print("places", type(places), number_of_places)

        # NOTE(review): presumably copies each row's name_normalized into
        # feature_id so candidates are grouped per name -- confirm in copy_prop.
        copy_prop(places, "name_normalized", "feature_id")

        print("set places", len(places))
        places = marge_utils.to_dicts(marge_resolver.resolve(places))
        print("MARGE resolved:", len(places))

        maxes = marge_utils.max_by_group(places, "score", "feature_id")
        print("maxes:", maxes)

        # Flag every candidate whose score equals the maximum of its group.
        for option in places:
            option["correct"] = option["score"] == maxes[option["feature_id"]]
        print("resolver SET CORRECT")

        # Regroup the candidates by the normalized name they were found for.
        target_places = group_into_dict(places, "name_normalized")
        print("recomposed:", list(target_places.keys()))

        # One Feature per matched name, one FeaturePlace per candidate.
        featureplaces = []
        for target, options in target_places.items():
            print("target:", target)
            print("\toptions:", options)
            location = name_location[target]
            topic_id = name_topic.get(target)
            count = location.get('count', 1)
            # Raises StopIteration (reported by the handler below) if no
            # candidate in the group was flagged correct.
            correct_option = next(option for option in options
                                  if option["correct"])
            geometry_used = "Shape" if correct_option["mpoly"] else "Point"
            feature = Feature.objects.create(count=count,
                                             name=location["name"],
                                             geometry_used=geometry_used,
                                             order_id=order_id,
                                             topic_id=topic_id,
                                             verified=False)
            need_to_save = False
            if "context" in location:
                feature.text = location['context']
                need_to_save = True
            if "date" in location:
                feature.end = location['date']
                feature.start = location['date']
                need_to_save = True
            if need_to_save:
                feature.save()
            for option in options:
                featureplaces.append(
                    FeaturePlace(confidence=option["score"],
                                 correct=bool(option["correct"]),
                                 feature=feature,
                                 place_id=option["id"],
                                 sort_order=-1))

        FeaturePlace.objects.bulk_create(featureplaces)

        print("resolved locations for order " + str(order_id))

        print("took:", (datetime.now() - start).total_seconds())

        return len(featureplaces) > 0

    except Exception as e:
        # Best-effort: report the failure; the caller receives None.
        print(e)
Ejemplo n.º 4
0
def resolve_locations(locations, order_id, max_seconds=10, countries=None, admin1codes=None):
    """Resolve location names through the fdgis SQL functions and store the result.

    Usable names are sent to ``fdgis_resolve`` (or
    ``fdgis_resolve_with_countries`` when ``countries`` is given). The
    returned rows become ``GeoEntity`` candidates, which are filtered by
    admin1 code, annotated with cluster frequency, median distance and topic
    match, and scored by ``predict.run``. For each resolved name one
    ``Feature`` is created for the order, plus one ``FeaturePlace`` per
    candidate; the first candidate with the highest probability is flagged
    ``correct``.

    All output goes through single-argument ``print(...)`` calls so the
    function behaves the same on Python 2 and Python 3.

    Args:
        locations: sequence of dicts (it is measured and sliced) with a
            ``name`` key and optional ``context``, ``date``, ``count`` and
            ``admin1code`` keys. The dicts are mutated: ``name`` is stripped,
            and ``topic_id`` is added when a usable name comes with
            ``context``.
        order_id: id of the ``Order`` the created features belong to.
        max_seconds: time budget; together with ``order.start`` it decides
            the boolean passed as last argument to the SQL function.
        countries: optional list of countries forwarded to
            ``fdgis_resolve_with_countries``. ``None`` means no restriction.
        admin1codes: optional list of admin1 codes; when non-empty only
            candidates whose ``admin1code`` is listed are kept.

    Returns:
        ``True`` when at least one ``FeaturePlace`` was created, ``False``
        when nothing was resolved or created, and ``None`` when an exception
        was caught (it is printed, not propagated).
    """
    try:
        # None replaces the former mutable [] defaults, which would have been
        # shared between calls.
        countries = [] if countries is None else countries
        admin1codes = [] if admin1codes is None else admin1codes

        print("starting resolve_locations with %s" % (type(locations),))
        print("locations =  %s %s" % (len(locations), locations[:5]))
        print("countries: %s" % (countries,))
        print("admin1codes: %s" % (admin1codes,))

        start = datetime.now()

        order = Order.objects.get(id=order_id)

        name_location = {}
        name_topic = {}
        names = []
        for location in locations:
            # Strip the name to play it safe; some extract methods yield
            # blank or padded names.
            name = location['name'] = location['name'].strip()
            # Names with these characters were never looked up; the filter is
            # kept so the set of resolved names does not change.
            if name and not any(char in name for char in [',', ')', '(', '?', "'", '"']):
                names.append(name)
                name_location[name] = location
                if "context" in location:
                    topic_id = get_topic(location['context'])
                    location['topic_id'] = topic_id
                    name_topic[name] = topic_id
                else:
                    name_topic[name] = None

        # Duplicates are dropped. (A former shuffle() here had no effect
        # because set() discards the order.)
        names = set(names)
        print("names: %s" % (names,))

        # NOTE(review): datetime.now() is local time merely labelled as UTC;
        # this is only right if the server runs in UTC -- confirm.
        seconds_left = max_seconds - (datetime.now().replace(tzinfo=UTC) - order.start).total_seconds()
        print("seconds_left: %s" % seconds_left)

        # Last argument of both SQL functions; its meaning is defined by
        # those functions and is not visible here.
        flag = seconds_left > 60

        # Values are passed as query parameters (the driver adapts Python
        # lists to SQL arrays) instead of being concatenated into the SQL,
        # which broke on braces/backslashes and allowed SQL injection through
        # the unfiltered countries list.
        if countries:
            statement = "SELECT * FROM fdgis_resolve_with_countries(%s::TEXT[], %s::TEXT[], %s);"
            params = [list(names), list(countries), flag]
        else:
            statement = "SELECT * FROM fdgis_resolve(%s::TEXT[], %s);"
            params = [list(names), flag]

        print("statement:\n%s\nparams: %s" % (statement, params))
        cursor = connection.cursor()
        try:
            cursor.execute(statement, params)
            rows = cursor.fetchall()
        finally:
            # The cursor used to be left open.
            cursor.close()

        geoentities = [GeoEntity(row) for row in rows]

        if admin1codes:
            geoentities = [g for g in geoentities if g.admin1code in admin1codes]

        print("filtering out geoentities that don't match admin1 code if there is an admin1 code match")
        for location in locations:
            admin1code = location.get('admin1code')
            if not admin1code:
                continue
            name = location['name']
            print("name: %s" % name)
            print("admin1code: %s" % admin1code)
            # Split the candidates for this name by whether their admin1
            # code agrees with the one supplied for the location.
            matches = []
            not_matches = []
            for geoentity in geoentities:
                if geoentity.place_name == name or geoentity.alias == name:
                    if geoentity.admin1code == admin1code:
                        matches.append(geoentity)
                    else:
                        not_matches.append(geoentity)
            # Only drop the disagreeing candidates when at least one agrees.
            if matches:
                for geoentity in not_matches:
                    geoentities.remove(geoentity)

        number_of_geoentities = len(geoentities)
        if number_of_geoentities == 0:
            # Nothing resolved: clustering an empty set would only raise.
            return False

        target_geoentities = defaultdict(list)
        target_coords = defaultdict(list)
        all_coords = []
        for geoentity in geoentities:
            coords = geoentity.point.coords
            all_coords.append(coords)
            target_geoentities[geoentity.target].append(geoentity)
            target_coords[geoentity.target].append(coords)

        # KMeans cannot form more clusters than there are samples, so cap
        # the former hard-coded 3 at the number of candidates.
        number_of_clusters = min(3, number_of_geoentities)
        print("number_of_clusters: %s" % number_of_clusters)
        estimator = KMeans(n_clusters=number_of_clusters)
        estimator.fit(all_coords)
        labels = estimator.labels_
        cluster_count = Counter()
        for cluster in labels:
            cluster_count[cluster] += 1
        # Share of all candidates that fall into each cluster.
        cluster_frequency = {cluster: float(count) / number_of_geoentities for cluster, count in cluster_count.items()}
        for geoentity, label in zip(geoentities, labels):
            geoentity.cluster_frequency = cluster_frequency[label]

        for target, options in target_geoentities.items():
            # Median distance from each candidate of this target to every
            # candidate of every target.
            distances = median(cdist(target_coords[target], all_coords), axis=1)
            for option, distance in zip(options, distances):
                option.median_distance_from_all_other_points = int(distance)

            topic_id = name_topic.get(target)
            for option in options:
                option.matches_topic = option.topic_id == topic_id

        print("add probability to each geoentity")
        predict.run(geoentities)

        # Exactly one candidate per target is flagged correct: the first one
        # carrying the highest probability.
        for target, options in target_geoentities.items():
            max_probability = max([o.probability for o in options])
            found_correct = False
            for option in options:
                if not found_correct and option.probability == max_probability:
                    option.correct = True
                    found_correct = True
                else:
                    option.correct = False

        # One Feature per resolved name, one FeaturePlace per candidate.
        featureplaces = []
        for target, options in target_geoentities.items():
            location = name_location[target]
            topic_id = name_topic.get(target)
            count = location['count'] if 'count' in location else 1
            feature = Feature.objects.create(count=count, name=target, geometry_used="Point", order_id=order_id, topic_id=topic_id, verified=False)
            need_to_save = False
            if "context" in location:
                feature.text = location['context']
                need_to_save = True
            if "date" in location:
                feature.end = location['date']
                feature.start = location['date']
                need_to_save = True
            if need_to_save:
                feature.save()
            for option in options:
                featureplaces.append(FeaturePlace(
                    confidence=float(option.probability),
                    correct=option.correct,
                    country_rank=option.country_rank,
                    feature=feature,
                    median_distance=option.median_distance_from_all_other_points,
                    place_id=option.place_id,
                    popularity=option.popularity))

        FeaturePlace.objects.bulk_create(featureplaces)

        print("resolved locations for order " + str(order_id))

        print("took: %s" % (datetime.now() - start).total_seconds())

        return len(featureplaces) > 0

    except Exception as e:
        # Best-effort: report the failure; the caller receives None.
        print(e)