Beispiel #1
0
)
from sqlalchemy import create_engine
import transaction
import tika_tools

def get_topic_distrib(text):
    """
    Compute the LDA topic distribution of a piece of text.

    The text is passed through ``topics_tools.parse``, converted to a
    bag-of-words with the model's ``id2word.doc2bow`` and finally looked up
    in ``topics_tools.lda_model``; the result of that lookup is returned
    as is.

    Per the original author's note the result has the form
    [(topicid, probability)], limited to P(topic) > epsilon.
    """
    # Function-scope import, kept from the original (presumably to avoid
    # loading the model when the module is merely imported -- confirm).
    import topics_tools
    model = topics_tools.lda_model
    to_bow = model.id2word.doc2bow
    bag_of_words = to_bow(topics_tools.parse(text))
    return model[bag_of_words]

# Bind the session to the local SQLite database file.
engine = create_engine('sqlite:///mosileno.sqlite')
DBSession.configure(bind=engine)

# Load every item up front; each one is then handled inside its own
# ``transaction.manager`` block below.
items = DBSession.query(Item).all()

for item in items:
    # Progress output.  The single-argument call form prints the same text
    # under Python 2 and, unlike the print statement, also parses under
    # Python 3.
    print(item.id)
    with transaction.manager:
        # Fetch the row again inside the transaction block (presumably so
        # the instance belongs to the current transaction -- confirm).
        item = DBSession.query(Item).get(item.id)
        data = tika_tools.tika(item.link)
        if data is None:
            # Nothing came back for this link: store no topics for it.
            continue
        tops = get_topic_distrib(data)
        # One ItemTopic row per (topic, score) pair of the distribution.
        for (topic, score) in tops:
            d = ItemTopic(item.id, topic, score)
            DBSession.add(d)
Beispiel #2
0
import transaction
import tika_tools


def get_topic_distrib(text):
    """
    Return the topic distribution that the LDA model assigns to ``text``.

    Steps: ``topics_tools.parse(text)`` -> ``id2word.doc2bow(...)`` ->
    lookup in ``topics_tools.lda_model``.  The lookup result is returned
    unchanged.

    According to the original author's note, that result is on the form
    [(topicid, probability)] for P(topic) > epsilon.
    """
    # Function-scope import, kept from the original (presumably to defer
    # loading the model until it is needed -- confirm).
    import topics_tools
    lda_model = topics_tools.lda_model
    doc2bow = lda_model.id2word.doc2bow
    parsed = topics_tools.parse(text)
    return lda_model[doc2bow(parsed)]


# Bind the session to the local SQLite database file.
engine = create_engine('sqlite:///mosileno.sqlite')
DBSession.configure(bind=engine)

# All items are read once; every item is then processed inside its own
# ``transaction.manager`` block.
items = DBSession.query(Item).all()

for item in items:
    # Progress output.  Written as a call with a single argument so that it
    # prints the same text under Python 2 and also parses under Python 3
    # (the print statement is a SyntaxError there).
    print(item.id)
    with transaction.manager:
        # Fetch the row again inside the transaction block (presumably so
        # the instance belongs to the current transaction -- confirm).
        item = DBSession.query(Item).get(item.id)
        data = tika_tools.tika(item.link)
        if data is None:
            # Nothing came back for this link: store no topics for it.
            continue
        tops = get_topic_distrib(data)
        # One ItemTopic row per (topic, score) pair of the distribution.
        for (topic, score) in tops:
            d = ItemTopic(item.id, topic, score)
            DBSession.add(d)
Beispiel #3
0
import transaction


def get_most_relevant_topics(topics_list):
    """
    Return the "names" of the most probable topics.

    ``topics_list`` is an iterable of (topicid, probability) pairs (for
    P(topic) > epsilon).  The pairs are ranked by decreasing probability
    (pairs with equal probability keep their input order) and the names of
    at most 3 of them are looked up in ``topics_tools.lda_topic_names``.

    The input is left unmodified.  An empty input yields an empty list.
    """
    # key=/reverse= replaces the Python 2-only ``cmp=`` argument; sorted()
    # works on a copy, so the caller's list is not reordered.
    ranked = sorted(topics_list, key=lambda pair: pair[1], reverse=True)
    top = ranked[:3]  # ARBITRARY (at most 3 topic names)
    # Iterating the pairs directly (instead of unpacking zip(*...)) keeps
    # the empty case from raising ValueError.
    return [topics_tools.lda_topic_names[pair[0]] for pair in top]


# Bind the session to the local SQLite database file.
engine = create_engine("sqlite:///mosileno.sqlite")
DBSession.configure(bind=engine)

# All items are read once; every item is then processed inside its own
# ``transaction.manager`` block.
items = DBSession.query(Item).all()

for item in items:
    with transaction.manager:
        # Progress output.  The single-argument call form prints the same
        # text under Python 2 and, unlike the print statement, also parses
        # under Python 3.
        print(item.id)
        # ItemTopic rows recorded for this item.
        res = DBSession.query(ItemTopic).filter_by(item=item.id).all()
        if res:
            topics = get_most_relevant_topics([(r.topic, r.weight) for r in res])
        else:
            # No topics recorded for this item: nothing to name.
            topics = []
        # One ItemTopicName row per selected topic name.
        for t in topics:
            tn = ItemTopicName(item.id, t)
            DBSession.add(tn)
Beispiel #4
0
import transaction


def get_most_relevant_topics(topics_list):
    """
    Return the "names" of the most probable topics.

    ``topics_list`` is an iterable of (topicid, probability) pairs (for
    P(topic) > epsilon).  The pairs are ordered from most to least probable
    (pairs with equal probability keep their input order) and the names of
    at most 3 of them are looked up in ``topics_tools.lda_topic_names``.

    The input is left unmodified.  An empty input yields an empty list.
    """
    # sorted() with key=/reverse= replaces ``list.sort(cmp=...)``, which
    # only exists on Python 2 and reordered the caller's list in place.
    by_probability = sorted(topics_list, key=lambda pair: pair[1],
                            reverse=True)
    best = by_probability[:3]  # ARBITRARY (at most 3 topic names)
    # Reading the id from each pair (instead of unpacking zip(*...)) means
    # an empty input simply produces an empty result.
    return [topics_tools.lda_topic_names[pair[0]] for pair in best]


# Bind the session to the local SQLite database file.
engine = create_engine('sqlite:///mosileno.sqlite')
DBSession.configure(bind=engine)

# Load every item up front; each one is then handled inside its own
# ``transaction.manager`` block below.
items = DBSession.query(Item).all()

for item in items:
    with transaction.manager:
        # Progress output.  Written as a call with a single argument so that
        # it prints the same text under Python 2 and also parses under
        # Python 3 (the print statement is a SyntaxError there).
        print(item.id)
        # ItemTopic rows recorded for this item.
        res = DBSession.query(ItemTopic).filter_by(item=item.id).all()
        if res:
            topics = get_most_relevant_topics([(r.topic, r.weight)
                                               for r in res])
        else:
            # No topics recorded for this item: nothing to name.
            topics = []
        # One ItemTopicName row per selected topic name.
        for t in topics:
            tn = ItemTopicName(item.id, t)
            DBSession.add(tn)