def bagwords(doc, s):
    """Tokenise one document and every document of a collection.

    Each document is passed through ``extract_words`` (which, per the
    original author's note, yields a token list without stopwords).

    :param doc: the single document to tokenise.
    :param s: iterable of further documents to tokenise.
    :return: tuple ``(tokens_of_doc, [tokens of each document in s])``.
    """
    single_tokens = extract_words(doc)
    collection_tokens = [extract_words(item) for item in s]
    return single_tokens, collection_tokens
def _topic_probs(topic_dist):
    """Map an iterable of ``(topic_id, probability)`` pairs to a defaultdict(float)."""
    probs = defaultdict(float)
    for topic, prob in topic_dist:
        probs[topic] = float(prob)
    return probs


def ldaIt(doc, s, query):
    """Return the topic-space distance between ``doc`` and every document in ``s``.

    The LDA model stored at ``../ldas/models/<query>.lda`` and the Mallet
    corpus at ``../ldas/trainset/<query>`` are loaded on every call. Each
    document is tokenised with ``extract_words``, converted to a bag of
    words and run through the model; topics the model does not report for a
    document count as probability 0.0.

    :param doc: the candidate document.
    :param s: iterable of already chosen documents to compare against.
    :param query: name used to locate the model file and the training corpus.
    :return: list with one value per document in ``s`` (same order): the
        Euclidean distance between its topic distribution and the
        candidate's. Smaller means more alike; the values are not
        normalised to the 0-1 range.
    """
    # NOTE (original author): for now only tags is under LDA ("filed #8" in
    # the original note, presumably field #8).
    model = models.LdaModel.load("../ldas/models/" + query + ".lda")

    # Rebuild the token -> id mapping from the training corpus so new
    # documents can be encoded as bags of words for the model.
    id2word = corpora.Dictionary()
    corpus = corpora.MalletCorpus("../ldas/trainset/" + query)
    id2word.merge_with(corpus.id2word)

    # Topic distribution of the candidate document.
    candidate_probs = _topic_probs(model[id2word.doc2bow(extract_words(doc))])

    # Topic distribution of every chosen document.
    # TODO (original author): add only the newly chosen document each time
    # instead of recomputing everything from scratch.
    chosen_dists = [
        model[id2word.doc2bow(extract_words(chosen))] for chosen in s
    ]

    num_topics = model.num_topics
    distances = []
    for topic_dist in chosen_dists:
        chosen_probs = _topic_probs(topic_dist)
        # range() rather than xrange() so the code runs on Python 2 and 3.
        squared = sum(
            (chosen_probs[i] - candidate_probs[i]) ** 2
            for i in range(num_topics)
        )
        distances.append(np.sqrt(squared))
    return distances
def doc_it(a_doc):
    """Create the relevance vector of a document (normalised term frequencies).

    The vector has one slot per entry of the module-level ``termdict``
    (word -> index), holding that word's count divided by the number of
    tokens in the document, followed by the values of ``a_doc["cn"]``
    converted to float.

    Future (original author): use all fields from ``extract_words``.

    NOTE(review): another ``doc_it`` is defined later in this file; if both
    live at module level the later one rebinds the name and this variant is
    never the one callers get.

    :param a_doc: document mapping; must provide the ``"cn"`` key and be
        accepted by ``extract_words``.
    :return: ``numpy.ndarray`` of length ``len(termdict) + len(a_doc["cn"])``.
    :raises KeyError: if a token is missing from ``termdict`` or ``a_doc``
        has no ``"cn"`` entry.
    """
    words = extract_words(a_doc)
    vec = len(termdict) * [0]

    # float() forces true division: on Python 2 without
    # ``from __future__ import division`` ``count / len(words)`` truncates
    # to 0 for every word that does not make up the whole document.
    n_words = float(len(words))
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for word, count in Counter(words).items():
        vec[termdict[word]] = count / n_words

    # Image features "cn" (original author's note: already normalised).
    vec.extend(float(i) for i in a_doc["cn"])
    return np.array(vec)
def doc_it(a_doc):
    """Create the relevance vector of a document (raw term counts).

    The vector has one slot per entry of the module-level ``termdict``
    (word -> index), holding the number of times that word occurs in the
    document, followed by the values of ``a_doc["cn"]`` converted to float.

    Future (original author): use all fields from ``extract_words``.

    NOTE(review): an earlier ``doc_it`` in this file (the variant that
    divides counts by document length) is rebound by this definition if
    both live at module level.

    :param a_doc: document mapping; must provide the ``"cn"`` key and be
        accepted by ``extract_words``.
    :return: ``numpy.ndarray`` of length ``len(termdict) + len(a_doc["cn"])``.
    :raises KeyError: if a token is missing from ``termdict`` or ``a_doc``
        has no ``"cn"`` entry.
    """
    words = extract_words(a_doc)
    vec = len(termdict) * [0]
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for word, count in Counter(words).items():
        vec[termdict[word]] = count

    # Open questions (original author): normalise by the single vector?
    # By the global highest value of cn?

    # Numerical features "cn".
    vec.extend(float(i) for i in a_doc["cn"])
    return np.array(vec)