def create_trees_cumm(docs, weights_):
    """Run ``create_trees`` over every document and gather the results.

    Each document is first passed through ``flatten_tree``; the flattened
    document and ``weights_`` are then handed to ``create_trees``, whose
    three return values are accumulated in three parallel lists. Progress is
    reported through ``_helpScripts.print_perc`` after each document.

    Args:
        docs: Sized iterable of documents (``len(docs)`` is used for the
            progress report).
        weights_: Forwarded unchanged to ``create_trees``.

    Returns:
        Tuple ``(words, weights_trees, root_inds)`` of lists, each holding
        one entry per document, in input order.
    """
    all_words = []
    all_weight_trees = []
    all_root_inds = []

    for doc_no, document in enumerate(docs):
        flat_doc = flatten_tree(document)
        doc_words, doc_weight_tree, doc_root_ind = create_trees(flat_doc, weights_)

        all_words.append(doc_words)
        all_weight_trees.append(doc_weight_tree)
        all_root_inds.append(doc_root_ind)

        # Percentage of documents handled so far, shifted by one.
        progress = float32(doc_no) / float32(len(docs)) * 100 + 1
        _helpScripts.print_perc(progress)

    return all_words, all_weight_trees, all_root_inds
def create_trees_cumm(docs, weights_):
    """Run ``create_trees`` over every document and gather the results.

    Each document is first passed through ``flatten_tree``; the flattened
    document and ``weights_`` are then handed to ``create_trees``, whose
    three return values are accumulated in three parallel lists. Progress is
    reported through ``_helpScripts.print_perc`` after each document.

    Args:
        docs: Sized iterable of documents (``len(docs)`` is used for the
            progress report).
        weights_: Forwarded unchanged to ``create_trees``.

    Returns:
        Tuple ``(words, weights_trees, root_inds)`` of lists, each holding
        one entry per document, in input order.
    """
    all_words = []
    all_weight_trees = []
    all_root_inds = []

    for doc_no, document in enumerate(docs):
        flat_doc = flatten_tree(document)
        doc_words, doc_weight_tree, doc_root_ind = create_trees(flat_doc, weights_)

        all_words.append(doc_words)
        all_weight_trees.append(doc_weight_tree)
        all_root_inds.append(doc_root_ind)

        # Percentage of documents handled so far, shifted by one.
        progress = float32(doc_no) / float32(len(docs)) * 100 + 1
        _helpScripts.print_perc(progress)

    return all_words, all_weight_trees, all_root_inds
def generate_sents(art_dir, sent_count=None, pos_count=1, neg_count=3, rng=np.random.RandomState(156), save_dir=None):
    """Run ``generate_sent`` on every ``*.txt`` file found under ``art_dir``.

    Args:
        art_dir: Path prefix that is concatenated directly with ``'*.txt'``
            to form the glob pattern, so it must end with a path separator
            to address the files inside a directory.
        sent_count: Currently unused; kept for interface compatibility.
        pos_count: Forwarded to ``generate_sent``.
        neg_count: Forwarded to ``generate_sent``.
        rng: Currently unused; kept for interface compatibility (see the
            note inside the loop).
        save_dir: Forwarded to ``generate_sent``. When given and missing on
            disk, the directory is created first. ``None`` (the default)
            skips the directory handling.

    Returns:
        Tuple ``(sents_true, sents_false)``: two lists holding, per article
        file, the first and second value returned by ``generate_sent``.
    """
    # The declared default save_dir=None used to crash in os.path.isdir();
    # only touch the file system when a directory was actually requested.
    if save_dir is not None and not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    art_files = glob.glob(art_dir + '*.txt')
    files_n = len(art_files)
    sents_true = []
    sents_false = []
    for ii, art_file in enumerate(art_files):
        # NOTE(review): the ``rng`` argument is not forwarded; every file
        # gets a fresh RandomState(156). Kept as-is so existing outputs do
        # not change -- confirm whether ``rng`` was meant to be passed on.
        sents_true_, sents_false_ = generate_sent(art_file, pos_count=pos_count, neg_count=neg_count,
                                                  rng=np.random.RandomState(156), save_dir=save_dir)
        sents_true.append(sents_true_)
        sents_false.append(sents_false_)
        _helpScripts.print_perc(float(ii) / float(files_n) * 100 + 1,
                                suffix='performed {} from {}'.format(ii, files_n))
    return sents_true, sents_false
def select_themes(wh_scores, wh_themes, themes_quest=ARTICLES_QUEST):
    """Pick up to ``themes_quest`` themes for every question.

    For each question the distinct themes found in the first column of
    ``wh_themes[q]`` are placed first (in set iteration order). The
    remaining slots are filled with the other themes occurring in
    ``wh_themes[q]``, ordered by descending summed score, where a theme's
    score is the sum of the ``wh_scores[q]`` entries at the positions where
    ``wh_themes[q]`` equals that theme.

    Args:
        wh_scores: Per-question score arrays, indexed with the positions
            found in the matching ``wh_themes[q]``.
        wh_themes: Per-question arrays of themes, indexed as 2-D
            (``[:, 0]`` is taken as the best theme of each row).
        themes_quest: Number of theme slots per question.

    Returns:
        Object array of shape ``(len(wh_themes), themes_quest)``. Slots for
        which no theme is available keep their initial value ``0``.
    """
    quest_n = len(wh_themes)
    themes = np.zeros((quest_n, themes_quest), dtype=object)
    for q in range(quest_n):
        q_themes_best = set(wh_themes[q][:, 0])
        # Clamp to the number of slots: more distinct best themes than
        # slots used to raise a shape-mismatch ValueError.
        best_arr = np.array(list(q_themes_best))[:themes_quest]
        best_n = len(best_arr)
        themes[q, :best_n] = best_arr

        q_themes_rest = np.array(list(set(wh_themes[q].flatten()) - q_themes_best), dtype=object)
        th_scores = np.array([np.sum(wh_scores[q][np.where(wh_themes[q] == theme)])
                              for theme in q_themes_rest])
        th_scores_sorted = np.argsort(th_scores)[::-1]

        # Fill only as many slots as there are remaining themes. Previously
        # too few themes raised ValueError, and a single remaining theme was
        # silently broadcast into every open slot.
        fill_n = min(len(th_scores_sorted), themes_quest - best_n)
        themes[q, best_n:best_n + fill_n] = q_themes_rest[th_scores_sorted[:fill_n]]

        _helpScripts.print_perc(float32(q)/float32(quest_n) * 100 + 1)
    return themes
def generate_sents(art_dir, sent_count=None, pos_count=1, neg_count=3, rng=np.random.RandomState(156), save_dir=None):
    """Run ``generate_sent`` on every ``*.txt`` file found under ``art_dir``.

    Args:
        art_dir: Path prefix that is concatenated directly with ``"*.txt"``
            to form the glob pattern, so it must end with a path separator
            to address the files inside a directory.
        sent_count: Currently unused; kept for interface compatibility.
        pos_count: Forwarded to ``generate_sent``.
        neg_count: Forwarded to ``generate_sent``.
        rng: Currently unused; kept for interface compatibility (see the
            note inside the loop).
        save_dir: Forwarded to ``generate_sent``. When given and missing on
            disk, the directory is created first. ``None`` (the default)
            skips the directory handling.

    Returns:
        Tuple ``(sents_true, sents_false)``: two lists holding, per article
        file, the first and second value returned by ``generate_sent``.
    """
    # The declared default save_dir=None used to crash in os.path.isdir();
    # only touch the file system when a directory was actually requested.
    if save_dir is not None and not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    art_files = glob.glob(art_dir + "*.txt")
    files_n = len(art_files)
    sents_true = []
    sents_false = []
    for ii, art_file in enumerate(art_files):
        # NOTE(review): the ``rng`` argument is not forwarded; every file
        # gets a fresh RandomState(156). Kept as-is so existing outputs do
        # not change -- confirm whether ``rng`` was meant to be passed on.
        sents_true_, sents_false_ = generate_sent(
            art_file, pos_count=pos_count, neg_count=neg_count, rng=np.random.RandomState(156), save_dir=save_dir
        )
        sents_true.append(sents_true_)
        sents_false.append(sents_false_)
        _helpScripts.print_perc(
            float(ii) / float(files_n) * 100 + 1, suffix="performed {} from {}".format(ii, files_n)
        )
    return sents_true, sents_false
def get_art_word_ind(sentences, vocab, sent_art, sent_l, pad_token_ind):
    """Build a padded word-index array covering all articles.

    An array of shape ``(len(sentences), sent_art, sent_l)`` is created and
    pre-filled with ``pad_token_ind``. For each article the result of
    ``sents2ind(article, vocab)`` (indexed here as a 2-D array) is copied
    into the top-left corner of that article's slab. Articles for which
    ``sents2ind`` raises ``IndexError`` are skipped and stay fully padded;
    no progress is reported for them.

    Args:
        sentences: Indexable, sized collection with one entry per article.
        vocab: Forwarded to ``sents2ind``.
        sent_art: Size of the second axis of the result.
        sent_l: Size of the third axis of the result.
        pad_token_ind: Fill value for positions without a word index.

    Returns:
        The padded index array (built from an ``int16`` array of ones
        multiplied by ``pad_token_ind``).
    """
    n_articles = len(sentences)
    out_shape = (n_articles, sent_art, sent_l)
    padded = np.ones(out_shape, dtype='int16') * pad_token_ind

    for art_no in range(n_articles):
        try:
            indices = sents2ind(sentences[art_no], vocab)
        except IndexError:
            # Best effort: leave this article as pure padding.
            continue

        n_rows = indices.shape[0]
        n_cols = indices.shape[1]
        padded[art_no, :n_rows, :n_cols] = indices[:, :]

        progress = float32(art_no) / float32(n_articles) * 100 + 1
        status = '{} articles prceeded from {}'.format(art_no, n_articles)
        _helpScripts.print_perc(progress, suffix=status)

    return padded
def get_art_word_ind(sentences, vocab, sent_art, sent_l, pad_token_ind):
    """Build a padded word-index array covering all articles.

    An array of shape ``(len(sentences), sent_art, sent_l)`` is created and
    pre-filled with ``pad_token_ind``. For each article the result of
    ``sents2ind(article, vocab)`` (indexed here as a 2-D array) is copied
    into the top-left corner of that article's slab. Articles for which
    ``sents2ind`` raises ``IndexError`` are skipped and stay fully padded;
    no progress is reported for them.

    Args:
        sentences: Indexable, sized collection with one entry per article.
        vocab: Forwarded to ``sents2ind``.
        sent_art: Size of the second axis of the result.
        sent_l: Size of the third axis of the result.
        pad_token_ind: Fill value for positions without a word index.

    Returns:
        The padded index array (built from an ``int16`` array of ones
        multiplied by ``pad_token_ind``).
    """
    n_articles = len(sentences)
    out_shape = (n_articles, sent_art, sent_l)
    padded = np.ones(out_shape, dtype='int16') * pad_token_ind

    for art_no in range(n_articles):
        try:
            indices = sents2ind(sentences[art_no], vocab)
        except IndexError:
            # Best effort: leave this article as pure padding.
            continue

        n_rows = indices.shape[0]
        n_cols = indices.shape[1]
        padded[art_no, :n_rows, :n_cols] = indices[:, :]

        progress = float32(art_no) / float32(n_articles) * 100 + 1
        status = '{} articles prceeded from {}'.format(art_no, n_articles)
        _helpScripts.print_perc(progress, suffix=status)

    return padded
def select_themes(wh_scores, wh_themes, themes_quest=ARTICLES_QUEST):
    """Pick up to ``themes_quest`` themes for every question.

    For each question the distinct themes found in the first column of
    ``wh_themes[q]`` are placed first (in set iteration order). The
    remaining slots are filled with the other themes occurring in
    ``wh_themes[q]``, ordered by descending summed score, where a theme's
    score is the sum of the ``wh_scores[q]`` entries at the positions where
    ``wh_themes[q]`` equals that theme.

    Args:
        wh_scores: Per-question score arrays, indexed with the positions
            found in the matching ``wh_themes[q]``.
        wh_themes: Per-question arrays of themes, indexed as 2-D
            (``[:, 0]`` is taken as the best theme of each row).
        themes_quest: Number of theme slots per question.

    Returns:
        Object array of shape ``(len(wh_themes), themes_quest)``. Slots for
        which no theme is available keep their initial value ``0``.
    """
    quest_n = len(wh_themes)
    themes = np.zeros((quest_n, themes_quest), dtype=object)
    for q in range(quest_n):
        q_themes_best = set(wh_themes[q][:, 0])
        # Clamp to the number of slots: more distinct best themes than
        # slots used to raise a shape-mismatch ValueError.
        best_arr = np.array(list(q_themes_best))[:themes_quest]
        best_n = len(best_arr)
        themes[q, :best_n] = best_arr

        q_themes_rest = np.array(
            list(set(wh_themes[q].flatten()) - q_themes_best), dtype=object)
        th_scores = np.array([
            np.sum(wh_scores[q][np.where(wh_themes[q] == theme)])
            for theme in q_themes_rest
        ])
        th_scores_sorted = np.argsort(th_scores)[::-1]

        # Fill only as many slots as there are remaining themes. Previously
        # too few themes raised ValueError, and a single remaining theme was
        # silently broadcast into every open slot.
        fill_n = min(len(th_scores_sorted), themes_quest - best_n)
        themes[q, best_n:best_n + fill_n] = q_themes_rest[
            th_scores_sorted[:fill_n]]

        _helpScripts.print_perc(float32(q) / float32(quest_n) * 100 + 1)
    return themes