# Imports needed by the snippets below; extract_words() and termdict are
# defined elsewhere in the source project.
from collections import Counter, defaultdict

import numpy as np
from gensim import corpora, models


def bagwords(doc, s):
    """Create a stopword-free token list for the candidate document
       and for every chosen document in s."""
    ddoc = extract_words(doc)
    ddocs = []
    for chosen_doc in s:
        ddocs.append(extract_words(chosen_doc))

    return ddoc, ddocs
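# Illustrative usage sketch (not part of the original source). It assumes
# extract_words() accepts the same document objects used elsewhere in these
# snippets; the field names below are hypothetical placeholders.
def _bagwords_example():
    candidate = {"title": "convolutional networks for image retrieval"}
    chosen = [{"title": "deep features for image search"},
              {"title": "topic models for text collections"}]
    cand_tokens, chosen_tokens = bagwords(candidate, chosen)
    # cand_tokens: token list for the candidate, stopwords removed
    # chosen_tokens: one token list per document in `chosen`
    return cand_tokens, chosen_tokens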
def ldaIt(doc, s, query):
    """Euclidean distance in LDA topic space between the candidate
       document and every chosen document in s."""
    # for now only the tags field is under LDA (issue #8)
    model_path = "../ldas/models/" + query + ".lda"
    model = models.LdaModel.load(model_path)  # load existing model

    train = "../ldas/trainset/"
    id2word = corpora.Dictionary()  # dictionary used to decode the model
    corpus = corpora.MalletCorpus(train + query)
    _ = id2word.merge_with(corpus.id2word)

    doc1 = extract_words(doc)
    doc_bow = id2word.doc2bow(doc1)  # bag-of-words for the candidate
    doc_topics = model[doc_bow]      # topic distribution of the candidate
    cand_dict = defaultdict(float)

    for topic, prob in doc_topics:  # candidate: topic -> probability
        cand_dict[topic] = float(prob)

    # TODO: cache the chosen documents' topics and add only the new ones
    # instead of recomputing everything from scratch on each call
    docs_topics = []

    for a_doc in s:
        bow = id2word.doc2bow(extract_words(a_doc))  # topics of a chosen doc
        docs_topics.append(model[bow])

    # compute the similarity
    num_top = model.num_topics
    simi_vec = []

    # distances are not bounded to [0, 1]; the topic vectors are not normalized
    for a_doc in docs_topics:

        chosen_dict = defaultdict(float)  # chosen doc: topic -> probability
        for topic, prob in a_doc:
            chosen_dict[topic] = float(prob)

        summ = 0
        for i in range(num_top):  # Euclidean distance over all topics
            ll_doc = chosen_dict[i]
            ll_cand = cand_dict[i]
            summ += (ll_doc - ll_cand) ** 2
        sim = np.sqrt(summ)

        simi_vec.append(sim)

    return simi_vec
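# Illustrative usage sketch (not part of the original source). ldaIt()
# expects a pre-trained gensim LDA model at ../ldas/models/<query>.lda and a
# Mallet-format corpus at ../ldas/trainset/<query>; the names below are
# hypothetical.
def _ldaIt_example(candidate_doc, chosen_docs):
    distances = ldaIt(candidate_doc, chosen_docs, "sample_query")
    # smaller Euclidean distance in topic space = more similar to the candidate
    closest = int(np.argmin(distances))
    return closest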
def doc_it(a_doc):
    """Create the relevance vector for a document.
       Future work: use all fields returned by extract_words."""
    words = extract_words(a_doc)
    counter = Counter(words)
    vec = [0.0] * len(termdict)
    l_doc = len(words)
    for word, count in counter.items():
        # normalize the term count by the document length
        vec[termdict[word]] = count / float(l_doc)
    # add image features
    # cn
    cn = a_doc["cn"]  # already normalized
    cn = [float(i) for i in cn]
    vec.extend(cn)
    return np.array(vec)
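# Illustrative usage sketch (not part of the original source). doc_it()
# relies on a module-level termdict mapping each vocabulary word to a column
# index, and on documents carrying a numeric "cn" field; the cosine
# similarity below is just one way the resulting vectors could be compared.
def _doc_it_example(candidate_doc, chosen_doc):
    v1 = doc_it(candidate_doc)
    v2 = doc_it(chosen_doc)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0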
def doc_it(a_doc):
    """Create the relevance vector for a document.
       Future work: use all fields returned by extract_words."""
    words = extract_words(a_doc)
    counter = Counter(words)
    vec = [0.0] * len(termdict)
    for word, count in counter.items():
        vec[termdict[word]] = count
    # open question: normalize within this vector alone,
    # or by the global highest value of cn?
    # add numerical features
    # cn
    cn = a_doc["cn"]
    cn = [float(i) for i in cn]
    vec.extend(cn)
    return np.array(vec)