def create_trees_cumm(docs, weights_):
    """
    Run ``create_trees`` over every document and gather the results.

    Each document is first passed through ``flatten_tree``; the flattened
    document and ``weights_`` are then handed to ``create_trees``, which is
    expected to return exactly three values.  Those values are accumulated
    in three parallel lists (one entry per document).

    Progress is reported through ``_helpScripts.print_perc`` after each
    document.

    Returns a tuple ``(words, weights_trees, root_inds)`` of lists.
    """
    all_words = []
    all_weight_trees = []
    all_root_inds = []

    for position, document in enumerate(docs):
        flat_document = flatten_tree(document)
        doc_words, doc_weight_tree, doc_root_ind = create_trees(
            flat_document, weights_)

        all_words.append(doc_words)
        all_weight_trees.append(doc_weight_tree)
        all_root_inds.append(doc_root_ind)

        # Percentage of documents handled so far (offset by one).
        progress = float32(position) / float32(len(docs)) * 100 + 1
        _helpScripts.print_perc(progress)

    return all_words, all_weight_trees, all_root_inds
def create_trees_cumm(docs, weights_):
    """
    Build trees for a collection of documents.

    For every document: flatten it with ``flatten_tree``, call
    ``create_trees`` with the flattened document and ``weights_``, and
    store the three returned values in three per-document lists.
    ``_helpScripts.print_perc`` is called once per document to show progress.

    Returns ``(words, weights_trees, root_inds)``, three lists with one
    entry per document.
    """
    collected_words = []
    collected_weight_trees = []
    collected_root_inds = []

    for doc_no, raw_doc in enumerate(docs):
        flattened = flatten_tree(raw_doc)
        tree_words, tree_weights, tree_root = create_trees(flattened, weights_)

        collected_words.append(tree_words)
        collected_weight_trees.append(tree_weights)
        collected_root_inds.append(tree_root)

        # Share of documents processed, expressed in percent (plus one).
        percent_done = float32(doc_no) / float32(len(docs)) * 100 + 1
        _helpScripts.print_perc(percent_done)

    return collected_words, collected_weight_trees, collected_root_inds
Beispiel #3
0
def generate_sents(art_dir, sent_count=None, pos_count=1, neg_count=3,
                   rng=np.random.RandomState(156), save_dir=None):
    """
    Run ``generate_sent`` on every ``*.txt`` file matched by ``art_dir + '*.txt'``.

    Parameters:
        art_dir: path prefix that is concatenated directly with ``'*.txt'``,
            so it must already end with a path separator to address the
            files inside a directory.
        sent_count: accepted for interface compatibility; not used here.
        pos_count, neg_count: forwarded unchanged to ``generate_sent``.
        rng: accepted for interface compatibility.
            NOTE(review): it is not forwarded; every article is processed
            with a fresh ``np.random.RandomState(156)``.  Confirm whether
            that is intentional before changing it, since forwarding ``rng``
            would alter the sampled output.
        save_dir: optional output directory; created when it does not exist
            and forwarded to ``generate_sent``.  ``None`` skips directory
            creation.

    Returns:
        ``(sents_true, sents_false)``: two lists holding, per article file,
        the first and second value returned by ``generate_sent``.
    """
    # os.path.isdir(None) raises TypeError, so the default save_dir=None
    # must bypass the directory check entirely.
    if save_dir is not None and not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    art_files = glob.glob(art_dir + '*.txt')
    sents_true = []
    sents_false = []
    for ii, art_file in enumerate(art_files):
        sents_true_, sents_false_ = generate_sent(art_file, pos_count=pos_count, neg_count=neg_count,
                                                  rng=np.random.RandomState(156), save_dir=save_dir)
        # One entry per article (the per-article result is kept as a unit).
        sents_true.append(sents_true_)
        sents_false.append(sents_false_)
        _helpScripts.print_perc(float(ii) / float(len(art_files)) * 100 + 1,
                                suffix='performed {} from {}'.format(ii, len(art_files)))
    return sents_true, sents_false
def select_themes(wh_scores, wh_themes, themes_quest=ARTICLES_QUEST):
    """
    Rank themes for every question.

    For question ``q`` the row of the result starts with the distinct values
    found in the first column of ``wh_themes[q]`` (in set iteration order).
    The remaining slots, up to ``themes_quest``, are filled with the other
    themes of ``wh_themes[q]``, ordered by descending sum of the entries of
    ``wh_scores[q]`` located where ``wh_themes[q]`` equals that theme.

    Returns an object array of shape ``(len(wh_themes), themes_quest)``.

    NOTE(review): the slice assignments look like they need the number of
    available themes to match the slots being filled -- confirm with callers.
    """
    n_questions = len(wh_themes)
    ranked = np.zeros((n_questions, themes_quest), dtype=object)

    for q in range(n_questions):
        theme_grid = wh_themes[q]

        # Themes appearing in the first column are placed first.
        top = set(theme_grid[:, 0])
        n_top = len(top)
        ranked[q, :n_top] = np.array(list(top))

        # Every other theme is scored by summing its matching score entries.
        leftovers = np.array(list(set(theme_grid.flatten()) - top), dtype=object)
        totals = np.array([
            np.sum(wh_scores[q][np.where(theme_grid == candidate)])
            for candidate in leftovers
        ])

        # Highest total first; take as many as there are slots left.
        by_score_desc = np.argsort(totals)[::-1]
        n_fill = themes_quest - n_top
        ranked[q, n_top:themes_quest] = leftovers[by_score_desc[:n_fill]]

        _helpScripts.print_perc(float32(q) / float32(n_questions) * 100 + 1)

    return ranked
def generate_sents(art_dir, sent_count=None, pos_count=1, neg_count=3, rng=np.random.RandomState(156), save_dir=None):
    """
    Run ``generate_sent`` on every ``*.txt`` file matched by ``art_dir + "*.txt"``.

    Parameters:
        art_dir: path prefix that is concatenated directly with ``"*.txt"``,
            so it must already end with a path separator to address the
            files inside a directory.
        sent_count: accepted for interface compatibility; not used here.
        pos_count, neg_count: forwarded unchanged to ``generate_sent``.
        rng: accepted for interface compatibility.
            NOTE(review): it is not forwarded; every article is processed
            with a fresh ``np.random.RandomState(156)``.  Confirm whether
            that is intentional before changing it, since forwarding ``rng``
            would alter the sampled output.
        save_dir: optional output directory; created when it does not exist
            and forwarded to ``generate_sent``.  ``None`` skips directory
            creation.

    Returns:
        ``(sents_true, sents_false)``: two lists holding, per article file,
        the first and second value returned by ``generate_sent``.
    """
    # os.path.isdir(None) raises TypeError, so the default save_dir=None
    # must bypass the directory check entirely.
    if save_dir is not None and not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    art_files = glob.glob(art_dir + "*.txt")
    sents_true = []
    sents_false = []
    for ii, art_file in enumerate(art_files):
        sents_true_, sents_false_ = generate_sent(
            art_file, pos_count=pos_count, neg_count=neg_count, rng=np.random.RandomState(156), save_dir=save_dir
        )
        # One entry per article (the per-article result is kept as a unit).
        sents_true.append(sents_true_)
        sents_false.append(sents_false_)
        _helpScripts.print_perc(
            float(ii) / float(len(art_files)) * 100 + 1, suffix="performed {} from {}".format(ii, len(art_files))
        )
    return sents_true, sents_false
def get_art_word_ind(sentences, vocab, sent_art, sent_l, pad_token_ind):
    """
    Build a padded array of word indices, one slab per article.

    The result starts out as an ``int16`` array of ones with shape
    ``(len(sentences), sent_art, sent_l)`` multiplied by ``pad_token_ind``.
    Each article is converted with ``sents2ind(article, vocab)`` and the
    resulting 2-D index array is copied into the top-left corner of that
    article's slab.  Articles for which ``sents2ind`` raises ``IndexError``
    are skipped and keep their padding (no progress output for those).

    Returns the filled array.
    """
    n_articles = len(sentences)
    slab_shape = (n_articles, sent_art, sent_l)
    padded = np.ones(slab_shape, dtype='int16') * pad_token_ind

    for idx in range(n_articles):
        current = sentences[idx]
        try:
            indexed = sents2ind(current, vocab)
        except IndexError:
            # Leave this article fully padded.
            continue

        n_rows = indexed.shape[0]
        n_cols = indexed.shape[1]
        padded[idx, :n_rows, :n_cols] = indexed[:, :]

        percent_done = float32(idx) / float32(n_articles) * 100 + 1
        status = '{} articles prceeded from {}'.format(idx, n_articles)
        _helpScripts.print_perc(percent_done, suffix=status)

    return padded
Beispiel #7
0
def get_art_word_ind(sentences, vocab, sent_art, sent_l, pad_token_ind):
    """
    Convert articles to a fixed-size array of word indices.

    An ``int16`` array of ones shaped ``(len(sentences), sent_art, sent_l)``
    is scaled by ``pad_token_ind`` to serve as padding.  For every article,
    ``sents2ind(article, vocab)`` yields a 2-D index array that overwrites
    the leading rows/columns of the article's slice.  When ``sents2ind``
    raises ``IndexError`` the article is skipped (its slice stays padded and
    no progress line is printed for it).

    Returns the resulting array.
    """
    total = len(sentences)
    word_inds = np.ones((total, sent_art, sent_l), dtype='int16')
    word_inds = word_inds * pad_token_ind

    for pos in range(total):
        art_sentences = sentences[pos]
        try:
            converted = sents2ind(art_sentences, vocab)
        except IndexError:
            # Conversion failed for this article: keep the padding.
            continue

        height = converted.shape[0]
        width = converted.shape[1]
        word_inds[pos, :height, :width] = converted[:, :]

        message = '{} articles prceeded from {}'.format(pos, total)
        _helpScripts.print_perc(
            float32(pos) / float32(total) * 100 + 1, suffix=message)

    return word_inds
Beispiel #8
0
def select_themes(wh_scores, wh_themes, themes_quest=ARTICLES_QUEST):
    """
    Select ``themes_quest`` themes per question, best-ranked first.

    Each result row begins with the distinct themes from the first column of
    ``wh_themes[q]`` (in set iteration order).  The rest of the row is filled
    with the remaining themes of ``wh_themes[q]``, sorted by descending sum
    of the ``wh_scores[q]`` entries at the positions where ``wh_themes[q]``
    equals the theme.

    Returns an object array of shape ``(len(wh_themes), themes_quest)``.

    NOTE(review): the slice assignments look like they need the number of
    available themes to match the slots being filled -- confirm with callers.
    """
    question_count = len(wh_themes)
    selected = np.zeros((question_count, themes_quest), dtype=object)

    for q in range(question_count):
        q_themes = wh_themes[q]

        # First-column themes take the leading positions.
        best = set(q_themes[:, 0])
        best_count = len(best)
        selected[q, :best_count] = np.array(list(best))

        # Score each remaining theme by the sum of its matching entries.
        others = np.array(list(set(q_themes.flatten()) - best), dtype=object)
        summed = np.array([
            np.sum(wh_scores[q][np.where(q_themes == other)])
            for other in others
        ])

        # Descending order of summed score, cut to the free slots.
        descending = np.argsort(summed)[::-1]
        free_slots = themes_quest - best_count
        selected[q, best_count:themes_quest] = others[descending[:free_slots]]

        _helpScripts.print_perc(float32(q) / float32(question_count) * 100 + 1)

    return selected