# Cross-domain experiment setup for a single product category.
#
# Names in mix_sources describe pooled training sets: the individual source
# names are joined with '+' and split apart again further down.
mix_sources = [
    'amazon+twitter',
    'amazon+ebay',
    'twitter+ebay',
    'amazon+twitter+ebay',
]
tree_category = 'electronics'

# Each entry is a (source name, list of source names, category) triple.
# Presumably (training source, evaluation sources, category) -- confirm
# against the code that consumes `combos`.
combos = [
    ('amazon', ['twitter', 'ebay'], tree_category),
    ('twitter', ['amazon', 'ebay'], tree_category),
    ('ebay', ['amazon', 'twitter'], tree_category),
    # New mixed training set test; the pairwise mixes are disabled for now.
    # ('amazon+twitter', ['amazon', 'twitter', 'ebay'], tree_category),
    # ('amazon+ebay', ['amazon', 'twitter', 'ebay'], tree_category),
    # ('twitter+ebay', ['amazon', 'twitter', 'ebay'], tree_category),
    ('amazon+twitter+ebay', ['amazon', 'twitter', 'ebay'], tree_category),
]

# Load the examples for every source; both results are indexed by source name.
corpus, y = util.read_from_sources(
    all_sources,
    tree_category=tree_category,
    max_examples=int(options.max_examples)
)

# Stemming is applied only when the option is exactly True.
if options.use_stemming is True:
    corpus = util.stem_corpus(corpus)

# Generate combined corpus and y, for the mixed training set test.
for mix in mix_sources:
    sourceList = mix.split('+')
    newY = []
    newCorpus = []
    # Concatenate the per-source labels and documents in the order the
    # sources appear in the mix name.
    for source in sourceList:
        newY = newY + y[source]
        newCorpus = newCorpus + corpus[source]
    y[mix] = newY
    # NOTE(review): newCorpus is built but not stored anywhere in the visible
    # code; this fragment may be cut off here -- confirm.
# Topic-modelling experiment: load reviews for one source, weight their
# bag-of-words vectors with TF-IDF, fit a 20-topic LDA model and print one of
# the learned topics.
from gensim import corpora, models, similarities

import util

all_sources = ['amazon']
tree_category = 'books'

# `corpus` is indexed by source name; only the single requested source is used.
corpus, y = util.read_from_sources(all_sources,
                                   tree_category=tree_category,
                                   max_examples=5000)
reviews = corpus[all_sources[0]]

# Non-ASCII characters are dropped before each review goes to clean_sentence.
# NOTE(review): on Python 3, .encode() yields bytes -- confirm that
# util.clean_sentence accepts them.
cleaned_reviews = [util.clean_sentence(review.encode('ascii', 'ignore'))
                   for review in reviews]

# gensim expects each example to be a list of words, instead of a long string
texts = [r.split() for r in cleaned_reviews]

# Map every distinct token to an integer id, then turn each review into a
# sparse bag-of-words vector.
dictionary = corpora.Dictionary(texts)
corpus_bow = [dictionary.doc2bow(text) for text in texts]

# Re-weight the bag-of-words vectors with TF-IDF.
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

# Don't use this - HDP with TF-IDF will crash your computer
# model = models.hdpmodel.HdpModel(corpus_bow, id2word=dictionary, T=20)
model = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=20)

# Per-review topic mixture, materialised as a list.
topic_distributions = list(model[corpus_tfidf])

# print_topic(topicid, topn=10) returns the formatted topic as a string rather
# than writing it out, so print it explicitly; otherwise running this script
# shows nothing.
print(model.print_topic(10))
mix_sources = ['amazon+twitter', 'amazon+ebay', 'twitter+ebay', 'amazon+twitter+ebay'] tree_category = 'videogames' combos = [('amazon', ['twitter', 'ebay'], tree_category), ('twitter', ['amazon', 'ebay'], tree_category), ('ebay', ['amazon', 'twitter'], tree_category),] """ #new mixed traning set test ('amazon+twitter', ['amazon','twitter', 'ebay'], tree_category), ('amazon+ebay', ['amazon','twitter', 'ebay'], tree_category), ('twitter+ebay', ['amazon','twitter', 'ebay'], tree_category), ('amazon+twitter+ebay', ['amazon','twitter', 'ebay'], tree_category), ] """ corpus, y = util.read_from_sources(all_sources, tree_category = tree_category, max_examples = int(options.max_examples)) corpus = util.stem_corpus(corpus) if options.use_stemming is True else corpus for master_source, external_sources, tree_category in combos: learn_cross_domain(master_source, external_sources, corpus, y, tree_category) #generate combined corpus and y, for mixed traning set test. """ for mix in mix_sources: sourceList = mix.split('+') newY = [] newCorpus = [] for source in sourceList: newY = newY + y[source] newCorpus = newCorpus + corpus[source] y[mix] = newY