Beispiel #1
0
  def do_guess_normals(self, normals, top=1):
    """Rank candidate normal items, keeping only those tied to the best one.

    The first ``top`` entries of ``normals`` are merged (via
    ``dict_from_items``) with the parents reported for each of them by
    ``self.memory_normal_items.normal_all_parents``. Every candidate that is
    neither the top-ranked entry nor directly related to it (according to
    ``NormalItemEntity.direct_relation``) is then dropped.

    Args:
      normals: sequence of ``(item, value)`` pairs; each item must expose a
        ``slug`` attribute. Presumably ``value`` is a weight -- confirm
        against the caller.
      top: number of leading ``normals`` entries to consider, also forwarded
        to the final ``rank_dict`` call.

    Returns:
      ``[]`` when ``normals`` is empty or ``None``; otherwise whatever
      ``rank_dict(candidates, top=top)`` returns.
    """
    if not normals:
      return []

    smart_print(normals, "in guess normals")
    available = normals[0: top]
    normal_tags_dict = {}
    dict_from_items(normal_tags_dict, available)

    parents_list = []
    for normal_item, value in available:
      parents_list.extend(self.memory_normal_items.normal_all_parents(normal_item.slug, value))
    smart_print(parents_list, "parent_list")
    dict_from_items(normal_tags_dict, parents_list)

    top_normals = rank_dict(normal_tags_dict, top=1)
    # Iterate over a snapshot of the keys: entries are deleted inside the
    # loop, and mutating a dict while iterating a live view of it raises
    # RuntimeError on interpreters where items()/keys() are views.
    for key in list(normal_tags_dict):
      related = any(
        top_normal == key or NormalItemEntity.direct_relation(key, top_normal)
        for top_normal, _ in top_normals
      )
      if not related:
        del normal_tags_dict[key]

    return rank_dict(normal_tags_dict, top=top)
Beispiel #2
0
  def normal_update(self, **kwargs):
    """Forward ``kwargs`` to ``Normal.cls_update`` and report the outcome.

    Args:
      **kwargs: update fields; a truthy ``slug`` entry is required.

    Returns:
      ``{'success': True}`` when ``Normal.cls_update`` completed without
      raising; ``{'success': False}`` when ``slug`` is missing/empty or any
      ``Exception`` was raised along the way.
    """
    smart_print(kwargs, 'normal_update')
    try:
      # The slug check stays inside the try so that any failure while
      # inspecting it is reported the same way as a failed update.
      if kwargs.get('slug', ''):
        Normal.cls_update(**kwargs)
        updated = True
      else:
        updated = False
    except Exception:
      updated = False
    return {'success': updated}
Beispiel #3
0
  def clusters(self, tags, origin, top_n=1):
    """Aggregate the guessed places and normal items into two weight dicts.

    Calls ``self.guess(tags, top_n)`` and sums the values of repeated keys
    for the guessed places and the guessed normal items. The guessed cities
    are only printed for debugging; they are not part of the result.

    TODO: not all segmented tokens nor the count limit are used yet.

    Args:
      tags: forwarded unchanged to ``self.guess``.
      origin: currently unused by this method.
      top_n: forwarded unchanged to ``self.guess``.

    Returns:
      A ``(places, others)`` tuple of dicts mapping each guessed key to the
      sum of its values.
    """
    smart_print(tags, 'cluster tags')
    guessed_places, guessed_cities, guessed_normal = self.guess(tags, top_n)
    smart_print(guessed_places, "guessed places")
    # Print the cities themselves; the places were previously printed a
    # second time under this label.
    smart_print(guessed_cities, "guessed cities")
    smart_print(guessed_normal, "guessed normal")

    places = {}
    others = {}
    for target, guessed in ((places, guessed_places), (others, guessed_normal)):
      for key, value in guessed:
        if key in target:
          target[key] += value
        else:
          target[key] = value
    return places, others
Beispiel #4
0
  def place_all_parents(self, slug, value, increment=0.1):
    """Collect the item stored under ``slug`` together with its ancestors.

    The item is filed by its ``category``: ``'PLACE'`` goes to the places
    list, ``'AREA'`` to the cities list, anything else (except ``'NORMAL'``)
    to the countries list. Parents found through ``self.place_parent`` are
    appended one level up with a weight raised by ``increment`` per step
    (the step counter advances even when a level is empty).

    Args:
      slug: key looked up with ``self.exists`` / ``self.get``.
      value: weight attached to the item itself.
      increment: amount added to the weight for each level climbed.

    Returns:
      A ``(places, cities, countries)`` tuple of lists of ``(item, weight)``
      pairs; three empty lists when the slug is unknown or the item's
      category is ``'NORMAL'``.
    """
    smart_print(slug, "in place_all_parents")
    if not self.exists(slug):
      return [], [], []

    origin = self.get(slug)
    category = origin.category
    if category == 'NORMAL':
      return [], [], []

    places, cities, countries = [], [], []
    level_by_category = {'PLACE': places, 'AREA': cities}
    level_by_category.get(category, countries).append((origin, value))

    # Weights are built by successive addition so the float results match
    # adding ``increment`` once per level.
    city_weight = value + increment
    for place, _ in places:
      city = self.place_parent(place)
      if city:
        cities.append((city, city_weight))

    country_weight = city_weight + increment
    for city, _ in cities:
      country = self.place_parent(city)
      if country:
        countries.append((country, country_weight))

    return places, cities, countries
Beispiel #5
0
  def parse(self, words, weight=1, TF_IDF=True):
    """Segment ``words`` and return the weighted keyword counts.

    Whitespace is replaced by ``ENGLISH_SEGMENT_SEPARATOR`` before the text
    is passed through ``to_str`` and ``self.seg.seg_txt``. Each token is
    decoded as UTF-8, has runs of ``'Z'`` turned back into a single space
    (presumably ``'Z'`` is the separator value -- confirm), is stripped, and
    is kept only when ``self.is_keyword`` accepts it.

    Args:
      words: text to segment; must be a ``basestring`` instance.
      weight: score contributed by each occurrence of a keyword.
      TF_IDF: when true, each occurrence's weight is multiplied by
        ``self.keywords.get(token, 1)``.

    Returns:
      A dict mapping each accepted token to its accumulated score, or ``[]``
      when ``words`` is not a string.
    """
    if not isinstance(words, basestring):
      # NOTE(review): the empty result here is a list while the normal
      # result is a dict; kept as-is for callers that may rely on it.
      return []

    smart_print(words)
    # Raw string: '\s' is an invalid escape in a plain string literal.
    words = re.sub(r'\s', ENGLISH_SEGMENT_SEPARATOR, words)
    smart_print(words)
    words = to_str(words)

    scores = {}
    for token in self.seg.seg_txt(words):
      token = token.decode('utf-8')
      token = re.sub('Z+', ' ', token).strip()
      if not self.is_keyword(token):
        continue

      if TF_IDF:
        score = weight * self.keywords.get(token, 1)
      else:
        score = weight

      if token in scores:
        scores[token] += score
      else:
        scores[token] = score

    return scores
Beispiel #6
0
  def do_guess_places(self, countries, cities, places, top=1):
    """Pick the most plausible country, city and places from the candidates.

    Each candidate list is folded into a dict with ``dict_from_items``. The
    single best country is chosen with ``rank_dict``; the first of the
    ``top`` ranked cities that ``PlaceItemEntity.direct_relation`` links to
    that country is kept; and every entry of ``places`` linked to that city
    is kept as well.

    Args:
      countries: ``(item, value)`` pairs for country candidates.
      cities: ``(item, value)`` pairs for city candidates.
      places: ``(item, value)`` pairs for place candidates.
      top: number of cities to rank; at most ``top - 1`` places are returned.

    Returns:
      A ``(guessed_places, guessed_cities)`` tuple:
        * no country candidates: the items of the city dict, twice;
        * a country but no matching city or place: ``(top_country, [])``;
        * otherwise: up to ``top - 1`` matching places followed by the
          matching city, and the matching city list on its own.
    """
    countries_dict, cities_dict, places_dict = {}, {}, {}
    dict_from_items(countries_dict, countries)
    dict_from_items(cities_dict, cities)
    dict_from_items(places_dict, places)
    smart_print(countries_dict, "countries_dict")
    smart_print(cities_dict, "cities_dict")
    smart_print(places_dict, "places_dict")

    top_country = rank_dict(countries_dict, top=1)
    available_countries = [country for country, _weight in top_country]

    # Keep only the first ranked city that belongs to the chosen country.
    top_city = []
    for city, value in rank_dict(cities_dict, top=top):
      linked = [
        PlaceItemEntity.direct_relation(city, country)
        for country in available_countries
      ]
      if any(linked):
        top_city.append((city, value))
        break

    available_cities = [city for city, _weight in top_city]

    # Places are taken from the raw input list, not from places_dict.
    top_places = []
    for place, value in places:
      linked = [
        PlaceItemEntity.direct_relation(place, city)
        for city in available_cities
      ]
      if any(linked):
        top_places.append((place, value))

    if not available_countries:
      return cities_dict.items(), cities_dict.items()

    if top_places or top_city:
      result = top_places[: top-1] + top_city
      return result, top_city

    return top_country, []
0
 def update(self, **kwargs):
   """Replace an entry by removing it and adding it again.

   ``kwargs`` is passed unchanged first to ``self.remove`` and then to
   ``self.add``.

   Returns:
     ``{'success': True}`` once both calls have returned.
   """
   smart_print(kwargs, "before update")
   # Remove first, then re-add with the same arguments.
   self.remove(**kwargs)
   self.add(**kwargs)
   outcome = {'success': True}
   return outcome
Beispiel #8
0
  def guess(self, tags, top_n=1):
    """Sort the items behind each tag into candidates and guess the best.

    For every ``(tag, value)`` pair, each entry of ``tag.items`` whose
    ``category`` is ``'NORMAL'`` becomes a normal candidate; every other
    entry is expanded through ``self.memory_place_items.place_all_parents``
    into place, city and country candidates. Each name in ``tag.parents``
    that ``self.memory_normal_items.get`` resolves is added as a normal
    candidate too.

    Args:
      tags: iterable of ``(tag, value)`` pairs; each tag exposes ``items``
        and ``parents`` attributes.
      top_n: forwarded to ``do_guess_places`` and ``do_guess_normals``.

    Returns:
      A ``(guessed_places, guessed_cities, guessed_normals)`` tuple built
      from the results of ``do_guess_places`` and ``do_guess_normals``.
    """
    normals = []
    places, cities, countries = [], [], []

    smart_print(tags, "in guess tags")
    for tag, value in tags:
      tag_items = tag.items
      smart_print(tag_items, "tag_items")
      for item in tag_items:
        if item.category == 'NORMAL':
          normals.append((item, value))
          continue

        found_places, found_cities, found_countries = (
          self.memory_place_items.place_all_parents(item.slug, value))
        countries.extend(found_countries)
        cities.extend(found_cities)
        places.extend(found_places)

      # Parents that cannot be resolved are silently skipped.
      for parent_name in tag.parents:
        parent = self.memory_normal_items.get(parent_name)
        if parent:
          normals.append((parent, value))

    smart_print(countries, "countries")
    smart_print(cities, 'cities')
    smart_print(places, 'places')
    smart_print(normals, 'normals')

    place_guess = self.do_guess_places(countries, cities, places, top_n)
    guessed_places, guessed_cities = place_guess
    smart_print(guessed_places, "guessed places")
    smart_print(guessed_cities, "guessed cities")

    guessed_normals = self.do_guess_normals(normals, top_n)
    smart_print(guessed_normals, "guessed normals")
    return guessed_places, guessed_cities, guessed_normals
Beispiel #9
0
  def rank(self):
    """Run the whole pipeline over ``self.contents`` and format the result.

    Steps: tokenise every content entry with ``self.parse``, merge the
    per-entry results with ``self.aggregation``, apply ``self.filter``,
    keep the 10 best tags via ``self.ranking``, cluster them with
    ``self.clusters`` and hand the outcome to ``self.format``.

    Returns:
      Whatever ``self.format(places, others)`` returns.
    """
    # Tokenise each entry by content and weight,
    # e.g. {'content': '我爱北京', 'weight': 5}.
    tags_list = [
      self.parse(content['content'], content.get('weight', 1))
      for content in self.contents
    ]
    smart_print(tags_list, "分词结果")

    # Merge the per-entry token results, e.g. {'北京': 5}.
    tags_dict = self.aggregation(*tags_list)
    smart_print(tags_dict, "标签聚合")

    filtered_tags = self.filter(tags_dict)
    smart_print(filtered_tags, "标签filter")

    # Rank the aggregated tags.
    top_n_tags_list = self.ranking(filtered_tags, top=10)
    smart_print(top_n_tags_list, "排名前N")

    # Derive the most authoritative places and the other info by weight.
    clustered = self.clusters(top_n_tags_list, filtered_tags, top_n=TOP_N)
    places, others = clustered
    smart_print(places, "地点")
    smart_print(others, "其他")

    # Return the formatted data.
    result = self.format(places, others)
    smart_print(result, "结果")
    return result