Ejemplo n.º 1
0
    def main(self):
        """Command-line entry point.

        Flags (mutually independent, -g wins if both are given):
            -g  generate customer and behavioral profile data
            -a  load the generated YAML files and analyze them
        """
        parser = argparse.ArgumentParser(description='modeling customer profile data with text')
        parser.add_argument('-g', action='store_true', help="generate customer and behavioral profile data")
        parser.add_argument('-a', action='store_true', help="analyze behavioral profile data")

        args = parser.parse_args()

        try:
            if args.g:
                generate_profile_data(100, 50, 2, 10, 100)
            elif args.a:
                behavioral_profiles = Yamlator.load("behavioral_profiles.yaml")
                customers = Yamlator.load("customers.yaml")
                analyze(behavioral_profiles, customers)

        # Was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; `except Exception:` keeps the best-effort
        # traceback reporting while letting those propagate.
        except Exception:
            traceback.print_exc()
Ejemplo n.º 2
0
    def main(self):
        """Command-line entry point.

        Flags (mutually independent, -g wins if both are given):
            -g  generate customer and behavioral profile data
            -a  load the generated YAML files and analyze them
        """
        parser = argparse.ArgumentParser(
            description='modeling customer profile data with text')
        parser.add_argument(
            '-g',
            action='store_true',
            help="generate customer and behavioral profile data")
        parser.add_argument('-a',
                            action='store_true',
                            help="analyze behavioral profile data")

        args = parser.parse_args()

        try:
            if args.g:
                generate_profile_data(100, 50, 2, 10, 100)
            elif args.a:
                behavioral_profiles = Yamlator.load("behavioral_profiles.yaml")
                customers = Yamlator.load("customers.yaml")
                analyze(behavioral_profiles, customers)

        # Was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; `except Exception:` keeps the best-effort
        # traceback reporting while letting those propagate.
        except Exception:
            traceback.print_exc()
Ejemplo n.º 3
0
def generate_profile_data(num_customer_profiles, terms_per_profile,
                          min_products_purchased_per_customer,
                          max_products_purchased_per_customer,
                          num_products_per_behavioral_profile):
    """
    Generates customer and behavioral profile data.

    Reads every seed/*.txt file, tallies per-genre term frequencies, then
    synthesizes random product descriptions per genre and writes two YAML
    files via Yamlator: "customers.yaml" and "behavioral_profiles.yaml".

    :param num_customer_profiles: customers generated for each genre
    :param terms_per_profile: on-genre terms drawn per product description
        (each followed by 4 noise terms from random genres)
    :param min_products_purchased_per_customer: inclusive lower bound on the
        random number of products per customer
    :param max_products_purchased_per_customer: inclusive upper bound
    :param num_products_per_behavioral_profile: products generated for each
        per-genre BehavioralProfile (no noise terms mixed in)

    NOTE(review): Python 2 source (print statement; `dict.keys()[ii]`
    indexing below) -- do not run under Python 3 without porting.
    """

    # builds term/count map by genre; genre is taken from the file name
    # before the first "-", path split on "/" (POSIX paths only).
    genre_freq_ass_map = dict()
    print "building user product data set"
    for f in glob.glob("seed/*.txt"):
        genre = f.split("/")[1].split("-")[0]
        if genre not in genre_freq_ass_map:
            genre_freq_ass_map[genre] = FrequencyAssessor()

        with open(f, "r") as current_file:
            text = current_file.read()
            tokens = nltk.word_tokenize(text)
            for t in tokens:
                # keep only non-punctuation tokens longer than 2 characters
                if t not in string.punctuation and len(t) > 2:
                    if t.endswith("."):
                        # strip a single trailing period
                        t = t[:len(t) - 1]

                    genre_freq_ass_map[genre].update(t.lower())

    # build genre > list of terms, frequency-of-occurrence weighted: each of
    # the top 500 terms is repeated `count` times so the uniform random picks
    # below are weighted by frequency.
    genre_terms_list = dict()
    for g in genre_freq_ass_map.keys():
        genre_terms_list[g] = []
        # get_top_terms presumably yields (term, count) pairs -- t[1] drives
        # the repetition, t[0] is the appended term; verify in FrequencyAssessor
        for t in genre_freq_ass_map[g].get_top_terms(max=500):
            for i in range(t[1]):
                genre_terms_list[g].append(t[0])

    # build customer profiles: per genre, each customer buys a random number
    # of products whose descriptions mix on-genre terms with noise terms.
    customer_profiles = []
    for g in genre_terms_list.keys():
        cid = 0
        for c in range(num_customer_profiles):
            product_descriptions = []
            pid = 0
            for p in range(
                    random.randint(min_products_purchased_per_customer,
                                   max_products_purchased_per_customer)):
                product_description = []
                for i in range(terms_per_profile):
                    word_index = random.randint(0,
                                                len(genre_terms_list[g]) - 1)
                    product_description.append(genre_terms_list[g][word_index])

                    z = 0
                    while z < 4:
                        # pop in some random words: 4 noise terms drawn from
                        # randomly chosen genres after every on-genre term
                        ii = random.randint(0,
                                            len(genre_terms_list.keys()) - 1)
                        random_genre = genre_terms_list.keys()[ii]
                        word_index = random.randint(
                            0,
                            len(genre_terms_list[random_genre]) - 1)
                        product_description.append(
                            genre_terms_list[random_genre][word_index])
                        z += 1
                pid += 1
                product_descriptions.append(
                    Product(pid, ' '.join(product_description)))
            cid += 1
            customer_profiles.append(Customer(g, cid, product_descriptions))

    Yamlator.dump("customers.yaml", customer_profiles)

    # build behavioral_profiles: one profile per genre, products drawn from
    # that genre's terms only (no noise words).
    behavioral_profiles = []
    for g in genre_terms_list.keys():
        product_descriptions = []
        pid = 0
        for p in range(num_products_per_behavioral_profile):
            product_description = []
            for i in range(terms_per_profile):
                word_index = random.randint(0, len(genre_terms_list[g]) - 1)
                product_description.append(genre_terms_list[g][word_index])

            pid += 1
            product_descriptions.append(
                Product(pid, ' '.join(product_description)))

        behavioral_profiles.append(BehavioralProfile(g, product_descriptions))

    Yamlator.dump("behavioral_profiles.yaml", behavioral_profiles)
Ejemplo n.º 4
0
def generate_profile_data(num_customer_profiles,
                            terms_per_profile,
                            min_products_purchased_per_customer,
                            max_products_purchased_per_customer,
                            num_products_per_behavioral_profile):
    """
    Generates customer and behavioral profile data.

    Reads every seed/*.txt file, tallies per-genre term frequencies, then
    synthesizes random product descriptions per genre and writes two YAML
    files via Yamlator: "customers.yaml" and "behavioral_profiles.yaml".

    :param num_customer_profiles: customers generated for each genre
    :param terms_per_profile: on-genre terms drawn per product description
        (each followed by 4 noise terms from random genres)
    :param min_products_purchased_per_customer: inclusive lower bound on the
        random number of products per customer
    :param max_products_purchased_per_customer: inclusive upper bound
    :param num_products_per_behavioral_profile: products generated for each
        per-genre BehavioralProfile (no noise terms mixed in)

    NOTE(review): Python 2 source (print statement; `dict.keys()[ii]`
    indexing below) -- do not run under Python 3 without porting.
    """

    # builds term/count map by genre; genre is taken from the file name
    # before the first "-", path split on "/" (POSIX paths only).
    genre_freq_ass_map=dict()
    print "building user product data set"
    for f in glob.glob("seed/*.txt"):
        genre = f.split("/")[1].split("-")[0]
        if genre not in genre_freq_ass_map:
            genre_freq_ass_map[genre]=FrequencyAssessor()

        with open (f, "r") as current_file:
            text = current_file.read()
            tokens = nltk.word_tokenize(text)
            for t in tokens:
                # keep only non-punctuation tokens longer than 2 characters
                if t not in string.punctuation and len(t) > 2:
                    if t.endswith("."):
                        # strip a single trailing period
                        t=t[:len(t)-1]

                    genre_freq_ass_map[genre].update(t.lower())

    # build genre > list of terms, frequency-of-occurrence weighted: each of
    # the top 500 terms is repeated `count` times so the uniform random picks
    # below are weighted by frequency.
    genre_terms_list=dict()
    for g in genre_freq_ass_map.keys():
        genre_terms_list[g]=[]
        # get_top_terms presumably yields (term, count) pairs -- t[1] drives
        # the repetition, t[0] is the appended term; verify in FrequencyAssessor
        for t in genre_freq_ass_map[g].get_top_terms(max=500):
            for i in range(t[1]):
                genre_terms_list[g].append(t[0])

    # build customer profiles: per genre, each customer buys a random number
    # of products whose descriptions mix on-genre terms with noise terms.
    customer_profiles=[]
    for g in genre_terms_list.keys():
        cid=0
        for c in range(num_customer_profiles):
            product_descriptions=[]
            pid=0
            for p in range(random.randint(min_products_purchased_per_customer,
                                          max_products_purchased_per_customer)):
                product_description=[]
                for i in range(terms_per_profile):
                    word_index = random.randint(0,len(genre_terms_list[g])-1)
                    product_description.append(genre_terms_list[g][word_index])

                    z=0
                    while z<4:
                        # pop in some random words: 4 noise terms drawn from
                        # randomly chosen genres after every on-genre term
                        ii = random.randint(0,len(genre_terms_list.keys())-1)
                        random_genre = genre_terms_list.keys()[ii]
                        word_index = random.randint(0,len(genre_terms_list[random_genre])-1)
                        product_description.append(genre_terms_list[random_genre][word_index])
                        z+=1
                pid+=1
                product_descriptions.append(Product(pid, ' '.join(product_description)))
            cid+=1
            customer_profiles.append(Customer(g, cid, product_descriptions))

    Yamlator.dump("customers.yaml", customer_profiles)

    # build behavioral_profiles: one profile per genre, products drawn from
    # that genre's terms only (no noise words).
    behavioral_profiles=[]
    for g in genre_terms_list.keys():
        product_descriptions=[]
        pid=0
        for p in range(num_products_per_behavioral_profile):
            product_description=[]
            for i in range(terms_per_profile):
                word_index = random.randint(0,len(genre_terms_list[g])-1)
                product_description.append(genre_terms_list[g][word_index])

            pid+=1
            product_descriptions.append(Product(pid, ' '.join(product_description)))

        behavioral_profiles.append(BehavioralProfile(g, product_descriptions))

    Yamlator.dump("behavioral_profiles.yaml", behavioral_profiles)