def main(self):
    """Parse command-line flags and dispatch to generation or analysis.

    Flags:
        -g  generate customer and behavioral profile data
        -a  load previously generated YAML data and analyze it
    """
    parser = argparse.ArgumentParser(
        description='modeling customer profile data with text')
    parser.add_argument('-g', action='store_true',
                        help="generate customer and behavioral profile data")
    parser.add_argument('-a', action='store_true',
                        help="analyze behavioral profile data")
    args = parser.parse_args()
    try:
        if args.g:
            generate_profile_data(100, 50, 2, 10, 100)
        elif args.a:
            behavioral_profiles = Yamlator.load("behavioral_profiles.yaml")
            customers = Yamlator.load("customers.yaml")
            analyze(behavioral_profiles, customers)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt are no longer swallowed silently.
        traceback.print_exc()
def main(self):
    """Entry point: dispatch on -g (generate data) or -a (analyze data).

    Flags:
        -g  generate customer and behavioral profile data
        -a  analyze behavioral profile data loaded from YAML
    """
    parser = argparse.ArgumentParser(
        description='modeling customer profile data with text')
    parser.add_argument(
        '-g', action='store_true',
        help="generate customer and behavioral profile data")
    parser.add_argument('-a', action='store_true',
                        help="analyze behavioral profile data")
    args = parser.parse_args()
    try:
        if args.g:
            generate_profile_data(100, 50, 2, 10, 100)
        elif args.a:
            behavioral_profiles = Yamlator.load("behavioral_profiles.yaml")
            customers = Yamlator.load("customers.yaml")
            analyze(behavioral_profiles, customers)
    except Exception:
        # Was a bare `except:`, which also trapped SystemExit and
        # KeyboardInterrupt; catch only real errors and report them.
        traceback.print_exc()
def generate_profile_data(num_customer_profiles, terms_per_profile,
                          min_products_purchased_per_customer,
                          max_products_purchased_per_customer,
                          num_products_per_behavioral_profile):
    """Generate synthetic customer and behavioral profile data.

    Reads seed corpora from seed/*.txt (filename convention:
    "<genre>-<anything>.txt"), builds a frequency-weighted term pool per
    genre, then writes two YAML files via Yamlator:

        customers.yaml           -- num_customer_profiles Customer objects
                                    per genre, each with a random number of
                                    Product descriptions
        behavioral_profiles.yaml -- one BehavioralProfile per genre with
                                    num_products_per_behavioral_profile
                                    Product descriptions

    Args:
        num_customer_profiles: customers generated per genre.
        terms_per_profile: in-genre terms per product description.
        min_products_purchased_per_customer: inclusive lower bound on
            products per customer.
        max_products_purchased_per_customer: inclusive upper bound.
        num_products_per_behavioral_profile: products per behavioral profile.
    """
    # Build a term/count assessor per genre from the seed corpus.
    genre_freq_ass_map = dict()
    print("building user product data set")
    for f in glob.glob("seed/*.txt"):
        # NOTE(review): assumes POSIX path separators and the
        # "<genre>-..." filename convention -- confirm on Windows.
        genre = f.split("/")[1].split("-")[0]
        if genre not in genre_freq_ass_map:
            genre_freq_ass_map[genre] = FrequencyAssessor()
        with open(f, "r") as current_file:
            text = current_file.read()
        tokens = nltk.word_tokenize(text)
        for t in tokens:
            # Ignore punctuation tokens and very short words.
            if t not in string.punctuation and len(t) > 2:
                if t.endswith("."):
                    t = t[:-1]  # drop a single trailing period
                genre_freq_ass_map[genre].update(t.lower())

    # Expand each genre's top terms into a frequency-weighted sampling
    # pool: a term with count N appears N times, so random.choice draws
    # terms proportionally to their observed frequency.
    genre_terms_list = dict()
    for g in genre_freq_ass_map:
        pool = []
        for term, count in genre_freq_ass_map[g].get_top_terms(max=500):
            pool.extend([term] * count)
        genre_terms_list[g] = pool

    # Materialize once: `dict.keys()[i]` (used by the original) is not
    # subscriptable on Python 3.
    genres = list(genre_terms_list.keys())

    # Build customer profiles.
    customer_profiles = []
    for g in genres:
        cid = 0  # customer ids restart per genre (original behavior)
        for _ in range(num_customer_profiles):
            product_descriptions = []
            pid = 0
            num_products = random.randint(
                min_products_purchased_per_customer,
                max_products_purchased_per_customer)
            for _ in range(num_products):
                product_description = [
                    random.choice(genre_terms_list[g])
                    for _ in range(terms_per_profile)]
                # Sprinkle in four terms from random genres as noise.
                for _ in range(4):
                    random_genre = random.choice(genres)
                    product_description.append(
                        random.choice(genre_terms_list[random_genre]))
                pid += 1
                product_descriptions.append(
                    Product(pid, ' '.join(product_description)))
            cid += 1
            customer_profiles.append(Customer(g, cid, product_descriptions))
    Yamlator.dump("customers.yaml", customer_profiles)

    # Build one behavioral profile per genre.
    behavioral_profiles = []
    for g in genres:
        product_descriptions = []
        pid = 0
        for _ in range(num_products_per_behavioral_profile):
            product_description = [
                random.choice(genre_terms_list[g])
                for _ in range(terms_per_profile)]
            pid += 1
            product_descriptions.append(
                Product(pid, ' '.join(product_description)))
        behavioral_profiles.append(BehavioralProfile(g, product_descriptions))
    Yamlator.dump("behavioral_profiles.yaml", behavioral_profiles)
def generate_profile_data(num_customer_profiles, terms_per_profile,
                          min_products_purchased_per_customer,
                          max_products_purchased_per_customer,
                          num_products_per_behavioral_profile):
    """Generate synthetic customer and behavioral profile YAML data.

    Seed text is read from seed/*.txt, where each filename encodes its
    genre as "<genre>-<anything>.txt". A frequency-weighted term pool is
    built per genre and used to synthesize product descriptions.

    Outputs (written via Yamlator):
        customers.yaml           -- Customer objects, num_customer_profiles
                                    per genre.
        behavioral_profiles.yaml -- one BehavioralProfile per genre.

    Args:
        num_customer_profiles: customers generated per genre.
        terms_per_profile: in-genre terms per product description.
        min_products_purchased_per_customer: inclusive lower bound on
            products per customer.
        max_products_purchased_per_customer: inclusive upper bound.
        num_products_per_behavioral_profile: products per behavioral profile.
    """
    # Term/count assessor per genre, fed from the seed corpus.
    genre_freq_ass_map = dict()
    print("building user product data set")
    for f in glob.glob("seed/*.txt"):
        # NOTE(review): POSIX separator and "<genre>-..." naming assumed;
        # verify on non-POSIX platforms.
        genre = f.split("/")[1].split("-")[0]
        if genre not in genre_freq_ass_map:
            genre_freq_ass_map[genre] = FrequencyAssessor()
        with open(f, "r") as current_file:
            text = current_file.read()
        tokens = nltk.word_tokenize(text)
        for t in tokens:
            # Skip punctuation and words of length <= 2.
            if t not in string.punctuation and len(t) > 2:
                if t.endswith("."):
                    t = t[:-1]  # strip one trailing period
                genre_freq_ass_map[genre].update(t.lower())

    # Frequency-weighted pools: each term repeated `count` times so a
    # uniform random.choice samples terms by observed frequency.
    genre_terms_list = dict()
    for g in genre_freq_ass_map:
        pool = []
        for term, count in genre_freq_ass_map[g].get_top_terms(max=500):
            pool.extend([term] * count)
        genre_terms_list[g] = pool

    # `dict.keys()[i]` (original code) fails on Python 3; take a list.
    genres = list(genre_terms_list.keys())

    # Customer profiles.
    customer_profiles = []
    for g in genres:
        cid = 0  # ids restart per genre, as in the original
        for _ in range(num_customer_profiles):
            product_descriptions = []
            pid = 0
            num_products = random.randint(
                min_products_purchased_per_customer,
                max_products_purchased_per_customer)
            for _ in range(num_products):
                product_description = [
                    random.choice(genre_terms_list[g])
                    for _ in range(terms_per_profile)]
                # Add four cross-genre noise terms.
                for _ in range(4):
                    random_genre = random.choice(genres)
                    product_description.append(
                        random.choice(genre_terms_list[random_genre]))
                pid += 1
                product_descriptions.append(
                    Product(pid, ' '.join(product_description)))
            cid += 1
            customer_profiles.append(Customer(g, cid, product_descriptions))
    Yamlator.dump("customers.yaml", customer_profiles)

    # Behavioral profiles: one per genre.
    behavioral_profiles = []
    for g in genres:
        product_descriptions = []
        pid = 0
        for _ in range(num_products_per_behavioral_profile):
            product_description = [
                random.choice(genre_terms_list[g])
                for _ in range(terms_per_profile)]
            pid += 1
            product_descriptions.append(
                Product(pid, ' '.join(product_description)))
        behavioral_profiles.append(BehavioralProfile(g, product_descriptions))
    Yamlator.dump("behavioral_profiles.yaml", behavioral_profiles)