def create_model_main(users, model_id, is_normalise=True):
    """Build one markov chain model covering the timelines of all *users*.

    :param users: iterable of user objects, each exposing a ``timeline``
    :param model_id: identifier under which the combined model is stored
    :param is_normalise: forwarded to ``get_words`` when tokenising text
    :return: the saved markov chain instance
    """
    mc = markov_chain(model_id, booster)
    for user in users:
        # pull just the text field out of every timeline entry and feed
        # each tokenised message into the shared chain
        for text in tools.flush(user.timeline, lambda entry: entry['text']):
            mc.add_message(get_words(text, is_normalise=is_normalise))
    mc.save()
    return mc
def create_model(user, is_normalise=True, mc=None):
    """Build (or extend) a markov chain model from one user's timeline.

    :param user: user object exposing ``name_`` and ``timeline``
    :param is_normalise: forwarded to ``get_words`` when tokenising text
    :param mc: existing markov chain to extend; when ``None`` a fresh one
        is created under ``user.name_``
    :return: the saved markov chain instance
    """
    if not mc:
        mc = markov_chain(user.name_, booster)
    timeline_text = tools.flush(user.timeline, lambda x: x['text'])
    for tt_el in timeline_text:
        # FIX: is_normalise was passed as a second positional argument to
        # add_message instead of to get_words. create_model_main shows the
        # intended contract: add_message takes only the tokenised message,
        # get_words takes the is_normalise flag.
        mc.add_message(get_words(tt_el, is_normalise=is_normalise))
    mc.save()
    return mc
def clust(models):
    """Pair every model with its most similar neighbour, merge each pair
    into a new model, and recurse on the merged set."""
    # NOTE(review): Python 2 only -- `print` statements, and dict.keys()/
    # dict.values() are indexed directly (they are views in Python 3).
    # NOTE(review): no base case -- with fewer than 2 models the max() below
    # gets an empty sequence and raises ValueError; and since `out` ends up
    # the same length as `models`, the recursion appears to never terminate.
    # Looks like a bug -- confirm intended stopping condition with the author.
    out = []
    for mc in models:
        t1 = time.time()
        # one single-entry dict {neighbour: score} per other model; max()
        # picks the neighbour whose diff score is highest
        nearest = max([{el: diff_markov_chains(mc.model_id_, el.model_id_, booster)} for el in models if el != mc], key=lambda x: x.values()[0])
        nearest.keys()[0].print_me()
        print nearest.values()[0]
        # merge this model with its nearest neighbour into a new model
        new_mc_id = booster.sum_models(mc.model_id_, nearest.keys()[0].model_id_)
        new_mc = markov_chain(new_mc_id, booster)
        out.append(new_mc)
        t2 = time.time()
        print 'time: ', t2 - t1
    return clust(out)
def big_differences():
    """Scrape not-yet-loaded users, fold their timelines into the shared
    'main' markov model, then visualise each user's distance from that
    main model against their tweet count."""
    log.info('extract messages')
    users = main_db.get_not_loaded_users()
    model_main = markov_chain('main', booster)
    result = []
    log.info('---------users to find is %s-------------------------------' % len(users))
    loaded_users = []
    for user in users:
        log.info('load user %s' % user)
        loaded_user = engine.scrap(user, neighbourhood=0)
        if not loaded_user:
            # scrape failed: skip without marking the user as loaded
            continue
        main_db.set_user_loaded(user)
        # fold this user's messages into the aggregate 'main' model...
        model_main = create_model(loaded_user, mc=model_main)
        # ...and also persist a standalone per-user model
        create_model(loaded_user)
        loaded_users.append(loaded_user)
    log.info('---------start process differences of models--------------')
    for user in loaded_users:
        model_current = markov_chain.create(user.name_, booster)
        # NOTE(review): here diff_markov_chains receives model objects, while
        # clust() passes model ids plus booster -- verify which signature the
        # callee actually expects.
        diff_element = diff_markov_chains(model_main, model_current)
        result.append({'name': user.name_, 'x': diff_element['content'], 'y': user.timeline_count})
        log.info('create difference... %s' % diff_element['content'])
    # distance of the main model to itself -- presumably used as a baseline
    # reference point on the plot; confirm
    diff_main = diff_markov_chains(model_main, model_main)
    nodes, edges = model_main.get_unique_nodes_edges()
    model_diffs = [
        {'x': diff_main['content'], 'y': float(edges) / nodes},
    ]
    vis.visualise(result, header='diff and tweets count', x_title='difference between this and main', y_title='count tweets', spec_symbols=model_diffs)
    model_main.visualise(100)
def process_names(file_name, class_name):
    """Read user names from *file_name*, scrape each user's timeline,
    tag the user with *class_name* in the DB and build a saved markov
    chain model per user.

    :param file_name: path to a file with one user name per line
    :param class_name: class label stored for every processed name
    :return: list of the saved markov chain models
    """
    # FIX: read the name list through a context manager so the file handle
    # is closed (the original leaked it via open(file_name).readlines()).
    with open(file_name) as names_file:
        names = names_file.readlines()
    result = []
    for name in names:
        name = tools.imply_dog(name, with_dog=True).strip()
        log.info("start processing name %s" % name)
        user = api_engine.scrap(name)
        db_.set_class(name, class_name)
        mc = markov_chain(name, booster)
        # (removed the unused local `messages = []` from the original)
        for t_el in user.timeline:
            log.debug('>>>>%s' % t_el)
            if t_el:
                mc.add_message(model_splitter(t_el['text']))
        mc.save()
        result.append(mc)
    return result