Beispiel #1
0
  def build(self, skip_cdiscount=False):
    """Count categories per transformed price and store the result on self.

    Rows come from ``parser(self.path)``; the first row is discarded and
    rows whose price field is not strictly positive are ignored.  Every
    remaining price goes through ``self.transform`` and, for each distinct
    transformed price, the method counts the rows seen per category (the
    ``c3_position`` field) together with a ``'total'`` entry.

    On exit ``self.prices`` maps transformed price -> counts dict,
    ``self.p_list`` holds the distinct transformed prices in ascending
    order, and ``self.build_max_prices()`` has been called.

    ``skip_cdiscount`` is accepted but never read by this method.
    """
    counts_by_price = {}
    distinct_prices = []
    reader = parser(self.path)
    # Discard the first row (presumably a header line).
    reader.next()
    self.reset_count(self.train_len)

    print("computing prices dictionary and prices list")

    for row in reader:
      raw_price = float(row[self.price_position])
      if raw_price <= 0:
        # Skipped rows are neither counted nor checked against loop_break.
        continue

      price = self.transform(raw_price)
      category = row[self.c3_position]
      if not smart_in(counts_by_price, price):
        # First time this transformed price shows up.
        distinct_prices.append(price)
        counts_by_price[price] = {category: 1, 'total': 1}
      else:
        per_category = counts_by_price[price]
        if smart_in(per_category, category):
          per_category[category] += 1
        else:
          per_category[category] = 1
        per_category['total'] += 1

      self.smart_count()
      if self.loop_break:
        break

    self.prices = counts_by_price
    self.p_list = sorted(distinct_prices)
    self.build_max_prices()
Beispiel #2
0
    def build(self):
        """Count categories per normalized brand and store them on self.

        Rows come from ``parser(self.path)``; the first row is discarded.
        For each row the brand field is passed through
        ``self.normalized_brand``; rows for which
        ``self.skip_cdiscount_function`` is true are ignored.  For every
        remaining brand the method counts rows per category (the
        ``c3_position`` field) plus a ``"total"`` entry.

        On exit ``self.brands`` maps brand -> counts dict and
        ``self.build_max_brands()`` has been called.
        """
        brand_counts = {}
        reader = parser(self.path)

        print("computing brands dictionary")

        self.reset_count(self.train_len)

        # Discard the first row (presumably a header line).
        reader.next()
        for row in reader:
            # Progress is counted for every row, including skipped ones.
            self.smart_count()
            brand = self.normalized_brand(row[self.brand_position])

            if self.skip_cdiscount_function(row):
                # Note: a skipped row also bypasses the loop_break check.
                continue

            category = row[self.c3_position]
            if not smart_in(brand_counts, brand):
                brand_counts[brand] = {category: 1, "total": 1}
            else:
                per_category = brand_counts[brand]
                if smart_in(per_category, category):
                    per_category[category] += 1
                else:
                    per_category[category] = 1
                per_category["total"] += 1

            if self.loop_break:
                break

        self.brands = brand_counts
        self.build_max_brands()
Beispiel #3
0
    def build(self, skip_cdiscount=False):
        """Build the per-price category histogram and the sorted price list.

        Reads ``parser(self.path)``, drops its first row, and ignores rows
        whose price field is zero or negative.  Each kept price is mapped
        through ``self.transform``; the method then records, per
        transformed price, how many rows were seen for each category
        (``c3_position`` field) and a running ``'total'``.

        Side effects: sets ``self.prices`` (price -> counts dict) and
        ``self.p_list`` (ascending list of distinct transformed prices),
        then calls ``self.build_max_prices()``.

        ``skip_cdiscount`` is part of the signature but is not used here.
        """
        histogram = {}
        seen_prices = []
        rows = parser(self.path)
        # Discard the first row (presumably a header line).
        rows.next()
        self.reset_count(self.train_len)

        print("computing prices dictionary and prices list")

        for row in rows:
            value = float(row[self.price_position])
            if value <= 0:
                # Non-positive prices are dropped before any counting.
                continue

            price = self.transform(value)
            cat = row[self.c3_position]
            if smart_in(histogram, price):
                bucket = histogram[price]
                if smart_in(bucket, cat):
                    bucket[cat] += 1
                else:
                    bucket[cat] = 1
                bucket['total'] += 1
            else:
                seen_prices.append(price)
                histogram[price] = {cat: 1, 'total': 1}

            self.smart_count()
            if self.loop_break:
                break

        seen_prices.sort()
        self.prices = histogram
        self.p_list = seen_prices
        self.build_max_prices()
Beispiel #4
0
    def build(self):
        """Build the per-brand category histogram and store it on self.

        Reads ``parser(self.path)`` and drops its first row.  Each row's
        brand field is normalized with ``self.normalized_brand``; rows
        flagged by ``self.skip_cdiscount_function`` are ignored.  For the
        others the method records, per brand, the number of rows seen for
        each category (``c3_position`` field) and a running ``'total'``.

        Side effects: sets ``self.brands`` and then calls
        ``self.build_max_brands()``.
        """
        histogram = {}
        rows = parser(self.path)

        print("computing brands dictionary")

        self.reset_count(self.train_len)

        # Discard the first row (presumably a header line).
        rows.next()
        for row in rows:
            # The progress counter advances even for rows skipped below.
            self.smart_count()
            brand = self.normalized_brand(row[self.brand_position])

            if self.skip_cdiscount_function(row):
                continue

            cat = row[self.c3_position]
            if smart_in(histogram, brand):
                bucket = histogram[brand]
                if smart_in(bucket, cat):
                    bucket[cat] += 1
                else:
                    bucket[cat] = 1
                bucket['total'] += 1
            else:
                histogram[brand] = {cat: 1, 'total': 1}

            if self.loop_break:
                break

        self.brands = histogram
        self.build_max_brands()
Beispiel #5
0
  def compute_category(self, item):
    """Return the best-scoring category for ``item``.

    The item's vocabulary (``self.voc_from_item``, optionally expanded by
    ``self.word_product`` when ``self.product`` is set) is turned into a
    word -> count mapping.  Candidate categories are the union of
    ``self.word_cats_dict[word]`` over the item's words.  Each candidate
    is scored as the sum, over the item's words present in the category's
    centroid, of centroid weight times word count.  The category with the
    strictly highest positive score wins; if none scores above zero the
    hard-coded fallback id ``1000009411`` is returned.
    """
    timer = Timer()
    timer.pick("debut")

    words = self.voc_from_item(item)
    if self.product:
      words = self.word_product(words)
    voc_dic = self.word_dic_from_list(words)
    timer.pick("vocabulaire construit")
    word_counts = voc_dic["dic"]

    # Collect every category associated with at least one of the words.
    candidates = set()
    for word in word_counts:
      try:
        known_cats = self.word_cats_dict[word]
      except KeyError:
        # No category for this word: it was never seen in the training set.
        continue
      candidates = candidates.union(known_cats)

    best_cat = 1000009411
    best_score = 0
    for cat in candidates:
      centroid = self.centroids[cat]
      score = 0
      for word, count in word_counts.items():
        if smart_in(centroid, word):
          score += centroid[word] * count
      # Strict comparison: on a tie the earlier candidate is kept.
      if score > best_score:
        best_score, best_cat = score, cat

    timer.pick("best_cat chope")
    return best_cat
Beispiel #6
0
    def compute_category(self, item):
        """Choose a category for ``item`` using the brand and price models.

        Field positions come from the training layout when ``self.train``
        is true and from the ``*_test`` attributes otherwise.  A brand that
        is not found in ``self.model_brand.brands`` is replaced by
        ``NO_BRAND``.

        * Negative price: the brand model decides.
        * Otherwise the price is transformed by the price model and snapped
          to an entry of its ``p_list`` via ``find_nearest``; the brand
          model decides only when the brand is known and its ``'proba'``
          value is strictly greater than the one stored for that price,
          else the price model decides.
        """
        brand_model = self.model_brand
        price_model = self.model_price
        if self.train:
            brand_position, price_position = (
                self.brand_position, self.price_position)
        else:
            brand_position, price_position = (
                self.brand_position_test, self.price_position_test)

        no_brand = NO_BRAND
        brand = item[brand_position]
        if not smart_in(brand_model.brands, brand):
            brand = no_brand

        price = float(item[price_position])
        if price < 0:
            return brand_model.cat_from_brand(brand)

        transformed = price_model.transform(price)
        nearest_price = find_nearest(price_model.p_list, transformed)

        # Both values are looked up before the brand is tested against
        # NO_BRAND, so a NO_BRAND entry must exist in the brand proba table.
        brand_proba = brand_model.proba[brand]['proba']
        price_proba = price_model.proba[nearest_price]['proba']
        if brand_proba > price_proba and brand != no_brand:
            return brand_model.cat_from_brand(brand)
        return price_model.cat_from_price(nearest_price)
Beispiel #7
0
  def compute_category(self, item):
    """Return a category for ``item`` from the brand or the price model.

    The brand and price columns are ``self.brand_position`` /
    ``self.price_position`` when ``self.train`` is true, and the
    ``*_test`` variants otherwise.  Brands absent from
    ``self.model_brand.brands`` are treated as ``NO_BRAND``.

    With a negative price only the brand model is consulted.  Otherwise
    the price is transformed, replaced by the value ``find_nearest``
    selects from the price model's ``p_list``, and the brand model is used
    only if the brand is known and its ``'proba'`` value strictly exceeds
    the price's; in every other case the price model is used.
    """
    brands = self.model_brand
    prices = self.model_price
    if self.train:
      brand_col = self.brand_position
      price_col = self.price_position
    else:
      brand_col = self.brand_position_test
      price_col = self.price_position_test

    unknown = NO_BRAND
    raw_brand = item[brand_col]
    brand = raw_brand if smart_in(brands.brands, raw_brand) else unknown

    price = float(item[price_col])
    if price < 0:
      return brands.cat_from_brand(brand)

    price = prices.transform(price)
    price = find_nearest(prices.p_list, price)

    # Look up both values first, exactly as the comparison needs them;
    # the NO_BRAND entry is therefore read even though it can never win.
    brand_score = brands.proba[brand]['proba']
    price_score = prices.proba[price]['proba']
    brand_wins = brand_score > price_score and brand != unknown
    if brand_wins:
      return brands.cat_from_brand(brand)
    return prices.cat_from_price(price)
Beispiel #8
0
 def word_dic_from_list(self, word_list):
   """Count the occurrences of each word in ``word_list``.

   Returns a dict with two entries: ``"dic"`` maps each word to the number
   of times it appears, and ``"total"`` is the number of words processed.
   """
   counts = {}
   total = 0
   for word in word_list:
     if not smart_in(counts, word):
       counts[word] = 1
     else:
       counts[word] += 1
     # Every word contributes to the total, new or already seen.
     total += 1
   return {"dic": counts, "total": total}
Beispiel #9
0
 def compute_category(self, item):
     """Map ``item`` (a row just read from the file) to a category.

     The brand is read at ``self.brand_position`` when ``self.train`` is
     true and at ``self.brand_position_test`` otherwise.  A brand that is
     not present in ``self.brands`` is replaced by ``NO_BRAND`` before
     ``self.cat_from_brand`` is asked for the category.
     """
     if self.train:
         position = self.brand_position
     else:
         position = self.brand_position_test

     # NOTE(review): the raw field is used for the lookup; if the keys of
     # self.brands were normalized when they were stored, confirm that raw
     # values are expected to match here.
     brand = item[position]
     if not smart_in(self.brands, brand):
         brand = NO_BRAND
     return self.cat_from_brand(brand)
Beispiel #10
0
 def compute_category(self, item):
     """Return the category associated with the brand of ``item``.

     ``item`` is a row as read from the file.  The brand column is
     ``self.brand_position`` in training mode (``self.train`` true) and
     ``self.brand_position_test`` otherwise.  Brands missing from
     ``self.brands`` fall back to ``NO_BRAND``; the category is whatever
     ``self.cat_from_brand`` returns for the resulting brand.
     """
     column = self.brand_position if self.train else self.brand_position_test
     candidate = item[column]
     known = smart_in(self.brands, candidate)
     brand = candidate if known else NO_BRAND
     return self.cat_from_brand(brand)
Beispiel #11
0
  def build(self):
    """Fit a TF-IDF vectorizer on one concatenated document per category.

    Rows come from ``parser(self.path)``; the first row is discarded and
    rows for which ``self.skip_cdiscount_function`` is true are ignored.
    The description field (``desc_position``) of every remaining row is
    grouped by category (``c3_position``); the descriptions of a category
    are then joined into a single document and a ``TfidfVectorizer`` is
    fitted on those documents.

    Returns:
      The fitted ``TfidfVectorizer``.
    """
    spam_reader = parser(self.path)

    descriptions_by_cat = {}

    # NOTE(review): this message says "brands" although descriptions are
    # being collected; left untouched because it is runtime output.
    print("computing brands dictionary")

    self.reset_count(self.train_len)

    # Discard the first row (presumably a header line).
    spam_reader.next()
    for row in spam_reader:
      self.smart_count()
      desc = row[self.desc_position]

      if self.skip_cdiscount_function(row):
        continue

      category = row[self.c3_position]
      if smart_in(descriptions_by_cat, category):
        descriptions_by_cat[category].append(desc)
      else:
        descriptions_by_cat[category] = [desc]

      if self.loop_break:
        break

    # Debug output for one hard-coded category.  Use .get so that a data
    # set that does not contain this category no longer raises KeyError.
    print(len(descriptions_by_cat.get("1000015309", [])))

    documents = []
    for cat_id in descriptions_by_cat:
      # Join with a space: gluing descriptions together directly fuses the
      # last word of one with the first word of the next and corrupts the
      # tokens seen by the vectorizer.  join also avoids repeated copying.
      final_text = " ".join(descriptions_by_cat[cat_id])
      documents.append(final_text)
      # Debug output kept from the original implementation.
      print(final_text)

    vectorizer = TfidfVectorizer()
    vectorizer.fit_transform(documents)
    return vectorizer
Beispiel #12
0
 def add_word(self, word, n=1):
     """Add ``n`` occurrences of ``word`` to ``self.dic`` and ``self.total``.

     A word not yet present in ``self.dic`` starts at ``n``; otherwise its
     current count is increased by ``n``.
     """
     counts = self.dic
     if not smart_in(counts, word):
         counts[word] = n
     else:
         counts[word] += n
     self.total += n
Beispiel #13
0
  def build(self):
    """Build the word -> categories index and the per-category word counts.

    Rows come from ``parser(self.path)``; the first row is discarded.  A
    row is ignored when ``self.skip_cdiscount_function`` or
    ``self.skip_book_function`` is true for it, or when ``self.cat_count``
    is set and the per-category row counter already exceeds it.  For each
    kept row:

    * every word of ``self.voc_from_item(row, train=True)`` is linked to
      the row's category in ``self.word_cats_dict`` (word -> list of
      categories, no duplicates);
    * the vocabulary (expanded by ``self.word_product`` when
      ``self.product`` is set) is added to the category's ``WordDic``.

    Afterwards, words linked to exactly one category are kept in the index
    only if ``del_word(word)`` on that category's ``WordDic`` returns a
    true value.  The method finally stores ``self.cats`` and calls
    ``self.build_centroids()``.
    """
    self.word_cats_dict = word_cats = {}
    cats = {}
    rows_per_cat = {}

    spam_reader = parser(self.path)

    print("computing category dictionary")

    self.reset_count(self.train_len)

    # Discard the first row (presumably a header line).
    spam_reader.next()
    for row in spam_reader:
      cat = row[self.c3_position]

      if self.skip_cdiscount_function(row) or self.skip_book_function(row):
        continue

      if self.cat_count is not None:
        # NOTE(review): the counter starts at 1 and a row is rejected only
        # once the counter is strictly greater than the limit, so up to
        # self.cat_count + 1 rows per category get through - confirm that
        # this is intended.
        if smart_in(rows_per_cat, cat):
          if rows_per_cat[cat] > self.cat_count:
            continue
          rows_per_cat[cat] += 1
        else:
          rows_per_cat[cat] = 1

      # Only rows that survived the filters above advance the counter.
      self.smart_count()
      if self.loop_break:
        break

      voc = self.voc_from_item(row, train=True)

      for word in voc:
        if not smart_in(word_cats, word):
          word_cats[word] = [cat]
        elif cat not in word_cats[word]:
          word_cats[word].append(cat)

      if self.product:
        voc = self.word_product(voc)
      for word in voc:
        if not smart_in(cats, cat):
          cats[cat] = WordDic()
        cats[cat].add_word(word)

    print(len(word_cats))

    kept = {}
    for word in word_cats:
      word_categories = word_cats[word]
      # del_word is only consulted for words tied to a single category.
      if (len(word_categories) != 1
          or cats[next(iter(word_categories))].del_word(word)):
        kept[word] = word_categories

    self.word_cats_dict = kept

    print(len(kept))

    # NOTE(review): next(iter(...)) raises StopIteration when the dict is
    # empty, and sys.getsizeof is shallow, so these products are only rough
    # debug figures.
    print("size of word_cats : %s " % (sys.getsizeof(kept) * sys.getsizeof(kept[next(iter(kept))])))
    print("size of cats : %s " % (sys.getsizeof(cats) * sys.getsizeof(next(iter(cats)))))
    self.cats = cats
    self.build_centroids()
0
 def add_word(self, word, n=1):
   """Record ``n`` more occurrences of ``word``.

   ``self.dic[word]`` is created with value ``n`` when the word is new and
   incremented by ``n`` otherwise; ``self.total`` always grows by ``n``.
   """
   word_counts = self.dic
   already_known = smart_in(word_counts, word)
   if already_known:
     word_counts[word] += n
   else:
     word_counts[word] = n
   self.total += n