Example #1
    def __init__(self, root):
        import config
        import pandas as pd  # pandas is needed for the HDF stores and CSV below
        root, data_path, model_path, vector_path = config.get_paths()
        self.root = root
        self.data_path = data_path
        self.model_path = model_path
        self.vector_path = vector_path
        # Hashtags that define the subject sets used throughout the project.
        self.all_hashtags = [
            "voetbal", "moslim", "werk", "economie", "jihad", "seks",
            "politiek"
        ]
        self.all_vectors_store = pd.HDFStore(self.root + "w2v_vector.h5")
        self.balanced_store = pd.HDFStore(self.root +
                                          "datasets/seeds/balanced.h5")
        self.tweets = pd.read_csv(self.root + "datasets/data_sample.csv")
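In Examples #1 and #3, config.get_paths() is unpacked into four path strings. The project's real config module is not shown here; a minimal stand-in along these lines (all names and paths are placeholders, purely illustrative) is enough to run those snippets:

# Hypothetical stand-in for the project's config module (placeholder paths).
def get_paths():
    root = "/data/project/"
    data_path = root + "datasets/"
    model_path = root + "models/"
    vector_path = root + "vectors/"
    return root, data_path, model_path, vector_path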
Example #2
# `config`, whoosh's `index` module, the project modules `pt` and `sol`, and
# LOGGER are imported at module level (not shown in this snippet).
def partition_and_generate_distributions(index_name: str):
    configuration = config.get_paths()
    ix = index.open_dir(configuration[index_name], readonly=True)
    LOGGER.info('Index path: ' + configuration[index_name])
    with ix.reader() as ix_reader:
        pa = pt.Partitioner(ix, ix_reader)
        print('Partitioner initiated!')
        parts = pa.generate([0.98, 0.1])
        parts = [p for p in parts]
        print('Parts created!')
        print('naive1 ({}, {})'.format(parts[0].name, parts[1].name))
        sol.generate_distance_distributions(
            cache=parts[0],
            disk=parts[1],
            save_path='/data/khodadaa/index/data',
            distance_type=['kld', 'avg-kld'])
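Examples #2, #5 and #6 index the result of config.get_paths() by name (configuration[index_name], c['db_tfs']), so that variant of get_paths evidently returns a mapping from names to paths. A hypothetical sketch, assuming a plain dict with invented keys and paths:

# Hypothetical dict-returning variant of get_paths (keys and paths are illustrative only).
def get_paths():
    return {
        'wiki13_index': '/data/indexes/wiki13',
        'db_tfs': '/data/stats/db_term_frequencies.csv',
    }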
Example #3
# Test whether the number of tokens influences the NN performance.
# We saw that it does not work well when tweets with varying numbers of tokens are all used together.

import pandas as pd
import numpy as np
import os, sys
import features
import config
root, data_path, model_path, vector_path = config.get_paths()
print(root)

import dataset
dset = dataset.Dataset(root)
# dset.create_subject_sets()

## Train a neural network for each token count separately
vectors = dset.all_vectors_store["data"]
voetbal = pd.read_csv(root + "datasets/seeds/voetbal.csv")
voetbal = pd.merge(voetbal, vectors, on="id")
voetbal["labels"] = 0
jihad = pd.read_csv(root + "datasets/seeds/jihad.csv")
jihad = pd.merge(jihad, vectors, on="id")
jihad["labels"] = 1

# Count the tokens that remain after removing placeholder tokens and "rt".
rm_list = ["<stopword>", "<mention>", "<url>", "rt"]
voetbal["ntokens"] = voetbal.filtered_text.apply(
    lambda x: len([a for a in x.split() if a not in rm_list]))
jihad["ntokens"] = jihad.filtered_text.apply(
    lambda x: len([a for a in x.split() if a not in rm_list]))
Example #4
from time import sleep

from config import get_paths
from models import DDQNLearner, DDQNPlayer
from utils import make_atari, wrap_deepmind, parse_args
from utils import Logger, Plotter

args = parse_args()
# for arg in vars(args):
# 	print(arg, getattr(args, arg))

ENV_NAME = args.env_name
ENV_VER = args.env_version
ENV_GYM = ENV_NAME + ENV_VER

save_dirs = get_paths(drive=args.drive_save, env_name=ENV_NAME)

PRINT_FREQ_EP = args.log_freq
SAVE_MODEL_FREQ = args.save_freq
LEARNING_START = args.learn_start

logger = Logger(save_dirs=save_dirs,
                log_types=[],
                log_freq=args.log_freq,
                mode=args.mode)
plotter = Plotter(save_dirs=save_dirs,
                  plot_types=[
                      'avg_scores_ep', 'avg_scores_ts', 'avg_scores_100_ep',
                      'avg_scores_100_ts', 'scores_ep', 'scores_ts',
                      'high_scores_ep', 'high_scores_ts', 'low_scores_ep',
                      'low_scores_ts', 'avg_loss_ep', 'avg_acc_ep',
Example #5
    # Body of a search(...) helper: `ix`, `field_name` and `limit` are defined
    # earlier, outside this fragment.
    with ix.searcher() as searcher:
        query = QueryParser(field_name, ix.schema).parse(user_query)
        facet = sorting.FieldFacet('count', reverse=True)
        results = searcher.search(query, sortedby=facet, limit=limit)
        print(results)
        for res in results:
            print('\n', res)
            if res.reader.has_vector(res.docnum, field_name):
                vgen = res.reader.vector_as('frequency', res.docnum,
                                            field_name)
                terms = [v for v in vgen]
                terms.sort(key=lambda tup: tup[1], reverse=True)
                print('Top terms: ', terms)
            else:
                print('0 term')


if __name__ == '__main__':
    index_name = 'wiki13_index'
    limit = None
    if len(sys.argv) > 1:
        index_name = sys.argv[1]
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])
    configuration = config.get_paths()

    user_query = 'public policy NOT "public policy"'
    while user_query != ':q':
        search(user_query, limit, configuration[index_name])
        user_query = input('Query [:q to exit] : ')
Example #6
# This snippet starts mid-function: the tail of a nested helper that estimates
# P(d|q), followed by the query-clarity sum over the vocabulary.
            prob_d_condit_q = prob_q_condit_d / norm
            prob += prob_t_condit_d * prob_d_condit_q
        return prob

    # Query clarity: KL divergence between the query language model P(t|Dq)
    # and the collection language model P(t|D), summed over the vocabulary.
    clt = 0.0
    for t in vocabulary:
        if collection_tfs[t] == 0:
            collection_tfs[t] = 1  # smooth zero counts so the division and log are defined
        prob_t_condit_D = collection_tfs[t] / collection_total_terms
        prob_t_condit_Dq = get_prob_t_condition_Dq(t)
        clt += prob_t_condit_Dq * log(prob_t_condit_Dq / prob_t_condit_D)
    return clt
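The function above appears to compute the query clarity score, a standard query-difficulty predictor: clarity(q) = sum over t of P(t|Dq) * log(P(t|Dq) / P(t|D)). A tiny self-contained check of that formula on invented counts (illustration only):

# Toy clarity computation; all numbers are made up for illustration.
from math import log

collection_counts = {'policy': 50, 'public': 40, 'football': 10}
collection_total = sum(collection_counts.values())
query_model = {'policy': 0.5, 'public': 0.4, 'football': 0.1}  # P(t|Dq)

clarity = sum(p * log(p / (collection_counts[t] / collection_total))
              for t, p in query_model.items())
print(clarity)  # 0.0: a query model identical to the collection model has zero clarity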


if __name__ == '__main__':
    c = config.get_paths()
    index_path = c[sys.argv[1]]
    query_file_path = sys.argv[2]
    save_path = sys.argv[3]

    config.setup_logger('querydifficulty')

    ix = index.open_dir(index_path, readonly=True)
    LOGGER.info('Index path: ' + index_path)
    ix_reader = ix.reader()

    vocabulary = []
    db_tfs = defaultdict(int)
    db_total_terms = 0
    with open(c['db_tfs'], 'r') as fr:
        for line in fr: