Example #1
import time


def index_queries_by_levels(data, ES_CONN, index_name, QUERIES_DOC_TYPE, fields_enc_level, file_name, create_query_func, run_logs):
    queries = []
    count = 0
    run_logs.insert_log('going to start indexing {0} queries'.format(index_name))
    if file_name:
        for row in get_csv_data(data, file_name):
            queries.append(create_query_func(row, fields_enc_level))
            if len(queries) == 1000:
                # Flush a full batch; retry every 5 seconds until the push succeeds.
                response = push_queries_to_percolator(ES_CONN, index_name, QUERIES_DOC_TYPE, queries, run_logs)
                while not response:
                    run_logs.insert_log('indexing error, retrying in 5 sec for {0}'.format(file_name))
                    time.sleep(5)
                    response = push_queries_to_percolator(ES_CONN, index_name, QUERIES_DOC_TYPE, queries, run_logs)
                count += 1000
                run_logs.insert_log('successfully indexed {0} queries from {1} ...'.format(count, file_name))
                queries = []
    # else:
    #     file_name="DATE"
    #     queries=create_date_querys()

    # Flush whatever is left over from the last partial batch.
    if len(queries) > 0:
        response = push_queries_to_percolator(ES_CONN, index_name, QUERIES_DOC_TYPE, queries, run_logs)
        while not response:
            run_logs.insert_log('indexing error, retrying in 5 sec for {0}'.format(file_name))
            time.sleep(5)
            response = push_queries_to_percolator(ES_CONN, index_name, QUERIES_DOC_TYPE, queries, run_logs)

    run_logs.insert_log('Successfully indexed {0} queries from {1}'.format(count + len(queries), file_name))
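
Example #1 leans on a push_queries_to_percolator helper that returns a falsy value when the bulk push fails. Below is a minimal sketch of such a helper, assuming the elasticsearch-py bulk helper and an index whose mapping declares a percolator-typed query field; the action shape and the truthy-on-success convention are inferred from the call site above, not taken from a real implementation.

from elasticsearch import helpers

def push_queries_to_percolator(es_conn, index_name, doc_type, queries, run_logs):
    # Assumed helper: bulk-index the query documents into the percolator index.
    actions = [{'_index': index_name, '_type': doc_type, '_source': q}
               for q in queries]
    try:
        helpers.bulk(es_conn, actions)
        return True
    except Exception as exc:
        run_logs.insert_log('bulk indexing failed: {0}'.format(exc))
        return False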
Example #2
import sys


def main():
    input_filename = "./data.csv"
    output_filename = "validation_logs.txt"
    sys.stdout = open(
        output_filename,
        'w')  # hacky way to redirect print statements to output file

    records = get_csv_data(input_filename)

    validate_records(records)
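
Reassigning sys.stdout works, but, as the comment admits, it is hacky: the original stream is never restored. A tidier equivalent uses the standard library's contextlib.redirect_stdout, which undoes the redirection when the with-block exits:

import contextlib

def main():
    input_filename = "./data.csv"
    output_filename = "validation_logs.txt"
    # Redirect print output to the log file only for the duration of the block.
    with open(output_filename, 'w') as log_file, contextlib.redirect_stdout(log_file):
        validate_records(get_csv_data(input_filename))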
Example #3
import csv


def send(params):
    # Load all user records, overwrite the one whose id matches params,
    # rewrite the CSV, then send the notification mail.
    data = get_csv_data()
    for user in data:
        user_id = str(user['id'])
        params_id = str(params['id'])
        if user_id == params_id:
            # Overwrite every field of the matching record in place.
            for key in user.keys():
                user[key] = params[key]
            break
    keys = data[0].keys()
    with open('./data.csv', 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, keys)
        writer.writeheader()
        writer.writerows(data)
    send_mail(params)
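
A call would pass a dict whose keys match the CSV columns. The field names below are hypothetical; only id is known from the code:

send({'id': '7', 'name': 'New Name', 'email': 'new@example.com'})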
Example #4
import sys
sys.path.append('C:\\Users\\qian\\Desktop\\homework')
import pytest
from utils import get_csv_data, get_csv_loginname
from all_api.topics import Topics

test_data_csv = get_csv_data('cases/data.csv')
#print(test_data_csv)
test_create_csv = get_csv_data('cases/create.csv')
test_update_csv = get_csv_data('cases/update.csv')
test_loginname_csv = get_csv_loginname('cases/loginname.csv')
test_first_comment_csv = get_csv_loginname('cases/firstcomment.csv')


@pytest.fixture
def get_accesstoken():
    # r = requests.get('http://39.107.96.138:3000/api/v1', auth=('user', 'pass'))
    # res = r.json()
    # accesstoken = res['accesstoken']
    accesstoken = 'd38a77f0-6f29-45cd-8d49-f72f15b98fd2'
    return accesstoken


#@pytest.mark.skip(reason = '')
@pytest.fixture
def get_topic_id(get_accesstoken):
    create_url = '/topics'
    topics = Topics(create_url)
    r = topics.post_create_topics(
        title='11111111111111111111111111111',
        tab='ask',
Example #5
"""isblank.py: does a CSV file contain empty values.

This script is used to check whether any of the values in a given column of a
CSV file are blank. This is useful when working with translations to see
whether there are any that have been missed.

"""
import sys

from utils import get_csv_data


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print ""
        print "usage: isblank.py <file> <column name>"
        sys.exit(1)

    file_path = sys.argv[1]
    column_name = sys.argv[2]

    table = get_csv_data(file_path)

    for idx, row in enumerate(table):
        if idx == 0:
            column = row.index(column_name)
        else:
            if not row[column].strip():
                print "Row %d is blank"
Example #6
def simulation():
    # Assumptions
    preferences_csv_file = 'preferences.csv'
    t_matrix_csv_file = 'transition_matrix.csv'
    num_users = 100000
    t_matrix_start_value = .9
    num_observations = 3
    mean_user_preference = 5
    sigma_user_preference = 3
    sites_to_destroy = ['KickassTorrents', 'Torrentz']

    # Parse CSV
    data, columns = get_csv_data(preferences_csv_file)
    hidden_states = columns['Name']
    observables = data[0][1:]
    categories = list(columns.keys())
    categories.remove('Name')

    site_scores = {}
    for row in data[1:]:
        site_scores[row[0]] = [int(x) for x in row[1:]]

    users = generate_users(categories, num_users, mean_user_preference,
                           sigma_user_preference)

    # Probability of each user starting at each site.
    # Each entry of pi_list is one user's distribution over the sites (in CSV order).
    state_pi_map = {k: 0 for k in hidden_states}
    pi_list = []
    for user in users:
        user_sim_prob = similarity_probabilities_with_user(
            user.preferences, site_scores)
        for k, v in user_sim_prob.items():
            state_pi_map[k] += v
        pi_list.append(user_sim_prob)
    state_pi_map = {k: v / num_users for k, v in state_pi_map.items()}
    print('Probability of a user starting at a site:')
    print(state_pi_map)

    # Transition matrix
    transition_matrix = {}
    data_t_matrix, columns_t_matrix = get_csv_data(t_matrix_csv_file)
    data_t_matrix = data_t_matrix[1:]
    # Set initial values from csv
    for row in data_t_matrix:
        transition_matrix[row[0]] = dict(
            zip(hidden_states,
                normalize([int(x) for x in row[1:]],
                          1 - t_matrix_start_value)))
    # Destroy sites and set self values (i.e. Netflix -> Netflix, Primewire -> Primewire)
    for key, value in transition_matrix.items():
        value[key] = t_matrix_start_value
        for site_to_destroy in sites_to_destroy:
            value[site_to_destroy] = 0
    # Normalize again to account for changes
    for key, value in transition_matrix.items():
        transition_matrix[key] = dict(
            zip(hidden_states, normalize(value.values())))

    # Populate emission matrix for each user
    emission_matrix_list = []
    for user in users:
        emission_matrix = {}
        for h_state in hidden_states:
            emission_matrix[h_state] = {}
            for i, observable in enumerate(observables):
                emission_matrix[h_state][observable] = user.preferences[i] / 10
        emission_matrix_list.append(emission_matrix)

    observations_over_time_list = []
    for user in users:
        observations_over_time_list.append(
            tuple(select_preferences(categories, user, num_observations)))

    freq_hidden_states = {k: 0 for k in hidden_states}
    for i, user in enumerate(users):
        path, max_prob = viterbi(observations_over_time_list[i],
                                 tuple(hidden_states), pi_list[i],
                                 transition_matrix, emission_matrix_list[i])
        freq_hidden_states[path[-1]] += 1
    print('After destroying the sites:')
    print({k: v / num_users for k, v in freq_hidden_states.items()})
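
The simulation relies on a normalize helper that is called both with and without an explicit total. A minimal sketch consistent with both call sites; the signature and the default of 1.0 are assumptions:

def normalize(values, total=1.0):
    # Assumed helper: scale values proportionally so they sum to `total`.
    values = list(values)
    s = float(sum(values))
    return [v * total / s for v in values]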
Example #7
import sys

from utils import get_csv_data
from csvkit import CSVKitWriter


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print ""
        print "usage: create_group.py <file> <file>"
        sys.exit(1)

    filein = sys.argv[1]
    fileout = sys.argv[2]

    table = get_csv_data(filein)

    genera = set()
    ordered = [['genus']]

    for idx, row in enumerate(table):
        if idx == 0:
            column = row.index('scientific_name')
        else:
            genus = row[column].split()[0]
            if genus not in genera:
                genera.add(genus)
                ordered.append([genus])

    with open(fileout, 'wb') as fp:
        writer = CSVKitWriter(fp)
        # Write out the single-column table of collected genera.
        writer.writerows(ordered)
Example #8
import sys

from utils import get_csv_data, get_column_values


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print ""
        print "usage: issubset.py <file> <file> <column name>"
        sys.exit(1)

    subset_path = sys.argv[1]
    superset_path = sys.argv[2]
    column_name = sys.argv[3]

    superset_table = get_csv_data(superset_path)
    subset_table = get_csv_data(subset_path)

    superset = get_column_values(column_name, superset_table)
    subset = get_column_values(column_name, subset_table)

    print "The first file contains %d distinct values" % len(subset_table)
    print "The second file contains %d distinct values" % len(superset_table)

    missing = []

    for item in subset:
        if item not in superset:
            missing.append(item)

    if missing:
        print "%d values from the first file are missing from the second:" % len(missing)
        for item in missing:
            print item
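
The membership loop above is quadratic in the number of values. Since only distinct values matter here, a set difference does the same check in roughly linear time (a sketch; it yields sorted distinct values rather than preserving input order):

missing = sorted(set(subset) - set(superset))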
Example #9
def get_users():
    return get_csv_data()
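
Several of these snippets share a get_csv_data helper whose signature varies from project to project. For the zero-argument variant used here and in Example #3, a minimal sketch; the ./data.csv path and the dict-per-row shape are assumptions inferred from how Example #3 consumes the result:

import csv

def get_csv_data(path='./data.csv'):
    # Assumed helper: load every CSV row as a dict keyed by the header row.
    with open(path, newline='') as csv_file:
        return list(csv.DictReader(csv_file))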