Example #1
0
def get_click_user(log_base_dir, day, filters):
    """Collect the ad -> click-user mapping from one day's RTB logs.

    Scans the log files under ``log_base_dir/day`` that match ``filters``
    and feeds each one to parse_file against a Redis connection whose
    settings come from the shared config ("redis_server" section).
    Returns the accumulated dict.
    """
    log_dir = os.path.join(log_base_dir, day)
    # Diagnostics so a wrong path or filter is easy to spot on the console.
    print(log_dir)
    print(os.path.abspath(log_dir))
    print(filters)
    print(os.path.curdir)
    log_files = utility.get_file_list(log_dir, filters)
    print("number of log file is %d" % len(log_files))
    # All connection settings are read from config rather than hard-coded.
    host = config.get_value("redis_server", "host")
    port = config.get_int_value("redis_server", "port")
    password = config.get_value("redis_server", "password")
    push_db = config.get_int_value("redis_server", "push_id_index")
    user_column = config.get_int_value("rtb_log_index", "user_id")
    client = connect_redis(host=host,
                           port=port,
                           db_index=push_db,
                           password=password)
    click_users = {}
    for log_file in log_files:
        parse_file(log_file,
                   client,
                   click_users,
                   index=user_column)

    return click_users
Example #2
0
def parse_nsf(r, table):
    """Convert an NSF bag-of-words corpus into MADlib input files.

    Reads the vocabulary from ``r/words.txt`` and writes it as a quoted,
    comma-separated list to ``<table>.dict``; then folds every file under
    ``r`` whose name ends in ``docwords.txt`` into ``<table>.madlib`` —
    one line per document of comma-separated word ids.  Google- and
    R-style outputs are accumulated but their writers are commented out.
    """
    start_time = time.time()
    dict_in_name = r + '/words.txt'
    infile = open(dict_in_name, 'r')
    total_words = [w.strip() for w in infile.readlines()]
    #for line in infile.readlines():
    #    total_words.append(line.split()[1].strip())
    # Emit the vocabulary as 'w1','w2',...,'wN'.
    dict_out_name = table + '.dict'
    outfile = open(dict_out_name, 'w')
    outfile.write("'")
    outfile.write("','".join(total_words))
    outfile.write("'")
    infile.close()
    outfile.close()

    file_path_list, file_name_list = utility.get_file_list(r)

    madlib = open(table + '.madlib', 'w')
    madlib_out = {}
    google_out = {}
    R_out = {}
    for file_path in file_path_list:
        if file_path.endswith("docwords.txt"):
            infile = open(file_path)  # NOTE(review): never closed — one leaked handle per file
            for line in infile.readlines():
                # Each line appears to be: doc_key word_id count (confirmed by usage below).
                parts = line.split()
                # NOTE(review): ids >= len(total_words) are skipped, yet the id
                # is indexed below as id - 1 — confirm the intended id base.
                if int(parts[1]) >= len(total_words): continue
                if parts[0] in madlib_out:
                    # Repeat the word id `count` times for this document.
                    for i in range(0, int(parts[2])):
                        madlib_out[parts[0]] += ',' + parts[1]
                else:
                    # NOTE(review): a document's first word is added once,
                    # ignoring its count — confirm this asymmetry is intended.
                    madlib_out[parts[0]] = parts[1]
                if parts[0] in google_out:
                    google_out[parts[0]] += total_words[
                        int(parts[1]) - 1] + ' ' + parts[2] + ' '
                else:
                    google_out[parts[0]] = total_words[
                        int(parts[1]) - 1] + ' ' + parts[2] + ' '
                if parts[0] in R_out:
                    R_out[parts[0]] += parts[1] + ':' + parts[2] + ' '
                else:
                    R_out[parts[0]] = parts[1] + ':' + parts[2] + ' '

    # Write documents in (arbitrary) dict order, renumbered 1..N.
    j = 1
    for key, value in madlib_out.iteritems():
        madlib.write(str(j) + ":" + value + '\n')
        j = j + 1


#    for key, value in google_out.iteritems():
#        google.write(value + '\n')

#    for key, value in R_out.iteritems():
#        R.write( str(len(value.split(':'))+1) + ' ' + value + '\n')

    madlib.close()

    elapsed_time = time.time() - start_time
    print r, 'parsing time:', elapsed_time
Example #3
0
def dict_by_file(r, table, parse_method = None):
    """Build a vocabulary from the ``all*`` files under ``r``.

    Concatenates the raw lines of every matching file, deduplicates them
    via utility.unique, writes the result to ``<basename-of-r>.dict`` and
    returns it.  ``table`` and ``parse_method`` are accepted for signature
    compatibility with the other dict_* builders but are unused here.
    """
    file_path_list, file_name_list = utility.get_file_list(r)
    total_words = []
    for file_path in file_path_list:
        # NOTE(review): this tests the full *path*, so it only matches when
        # get_file_list returns relative paths starting with 'all' — confirm
        # whether the file name was intended instead.
        if file_path.startswith('all'):
            # `with` closes the handle; the original open(...).readlines()
            # leaked it until garbage collection.
            with open(file_path, 'r') as infile:
                total_words += infile.readlines()
    vocabulary = utility.unique(total_words)  # renamed: avoid shadowing builtin `dict`
    utility.write_dict_file(vocabulary, r.split('/')[-1] + '.dict')
    return vocabulary
Example #4
0
def dict_by_text(r, table, parse_method = None):
    """Build and persist the vocabulary for the corpus under ``r``.

    Each file is turned into lines by ``parse_method`` and tokenized with
    utility.get_file_words; the deduplicated word list is written to
    ``<table>.dict`` and returned.

    Note: ``parse_method`` defaults to None for signature compatibility,
    but a callable is required — passing None raises TypeError.
    """
    file_path_list, file_name_list = utility.get_file_list(r)
    total_words = []
    for file_path in file_path_list:
        lines = parse_method(file_path)
        total_words += utility.get_file_words(lines)
    vocabulary = utility.unique(total_words)  # renamed: avoid shadowing builtin `dict`
    utility.write_dict_file(vocabulary, table + '.dict')
    return vocabulary
Example #5
0
def dict_by_text(r, table, parse_method=None):
    """Build and persist the vocabulary for the corpus under ``r``.

    Each file is turned into lines by ``parse_method`` and tokenized with
    utility.get_file_words; the deduplicated word list is written to
    ``<table>.dict`` and returned.

    Note: ``parse_method`` defaults to None for signature compatibility,
    but a callable is required — passing None raises TypeError.
    """
    file_path_list, file_name_list = utility.get_file_list(r)
    total_words = []
    for file_path in file_path_list:
        lines = parse_method(file_path)
        total_words += utility.get_file_words(lines)
    vocabulary = utility.unique(total_words)  # renamed: avoid shadowing builtin `dict`
    utility.write_dict_file(vocabulary, table + '.dict')
    return vocabulary
Example #6
0
def dict_by_file(r, table, parse_method=None):
    """Build a vocabulary from the ``all*`` files under ``r``.

    Concatenates the raw lines of every matching file, deduplicates them
    via utility.unique, writes the result to ``<basename-of-r>.dict`` and
    returns it.  ``table`` and ``parse_method`` are accepted for signature
    compatibility with the other dict_* builders but are unused here.
    """
    file_path_list, file_name_list = utility.get_file_list(r)
    total_words = []
    for file_path in file_path_list:
        # NOTE(review): this tests the full *path*, so it only matches when
        # get_file_list returns relative paths starting with 'all' — confirm
        # whether the file name was intended instead.
        if file_path.startswith('all'):
            # `with` closes the handle; the original open(...).readlines()
            # leaked it until garbage collection.
            with open(file_path, 'r') as infile:
                total_words += infile.readlines()
    vocabulary = utility.unique(total_words)  # renamed: avoid shadowing builtin `dict`
    utility.write_dict_file(vocabulary, r.split('/')[-1] + '.dict')
    return vocabulary
Example #7
0
def parse_nsf(r, table):
    """Convert an NSF bag-of-words corpus into MADlib input files.

    Reads the vocabulary from ``r/words.txt`` and writes it as a quoted,
    comma-separated list to ``<table>.dict``; then folds every file under
    ``r`` whose name ends in ``docwords.txt`` into ``<table>.madlib`` —
    one line per document of comma-separated word ids.  Google- and
    R-style outputs are accumulated but their writers are commented out.
    """
    start_time = time.time()
    dict_in_name = r + '/words.txt'
    infile = open(dict_in_name, 'r')
    total_words = [w.strip() for w in infile.readlines()]
    #for line in infile.readlines():
    #    total_words.append(line.split()[1].strip())
    # Emit the vocabulary as 'w1','w2',...,'wN'.
    dict_out_name = table + '.dict'
    outfile = open(dict_out_name, 'w')
    outfile.write("'")
    outfile.write("','".join(total_words))
    outfile.write("'")
    infile.close()
    outfile.close()

    file_path_list, file_name_list = utility.get_file_list(r)

    madlib = open(table + '.madlib', 'w')
    madlib_out = {}
    google_out = {}
    R_out      = {}
    for file_path in file_path_list:
        if file_path.endswith("docwords.txt"):
            infile = open(file_path)  # NOTE(review): never closed — one leaked handle per file
            for line in infile.readlines():
                # Each line appears to be: doc_key word_id count (confirmed by usage below).
                parts = line.split()
                # NOTE(review): ids >= len(total_words) are skipped, yet the id
                # is indexed below as id - 1 — confirm the intended id base.
                if int(parts[1]) >= len(total_words): continue
                if parts[0] in madlib_out:
                    # Repeat the word id `count` times for this document.
                    for i in range(0, int(parts[2])):
                        madlib_out[parts[0]] += ',' + parts[1]
                else:
                    # NOTE(review): a document's first word is added once,
                    # ignoring its count — confirm this asymmetry is intended.
                    madlib_out[parts[0]] = parts[1]
                if parts[0] in google_out:
                    google_out[parts[0]] += total_words[int(parts[1]) - 1] + ' ' + parts[2] + ' '
                else:
                    google_out[parts[0]] = total_words[int(parts[1]) - 1] + ' ' + parts[2] + ' '
                if parts[0] in R_out:
                    R_out[parts[0]] += parts[1] + ':' + parts[2] + ' '
                else:
                    R_out[parts[0]] = parts[1] + ':' + parts[2] + ' '

    # Write documents in (arbitrary) dict order, renumbered 1..N.
    j = 1
    for key, value in madlib_out.iteritems():
        madlib.write(str(j) + ":" + value + '\n')
        j = j + 1

#    for key, value in google_out.iteritems():
#        google.write(value + '\n')

#    for key, value in R_out.iteritems():
#        R.write( str(len(value.split(':'))+1) + ' ' + value + '\n')

    madlib.close()

    elapsed_time = time.time() - start_time
    print r, 'parsing time:', elapsed_time
Example #8
0
def prepare_reuters21578(r):
    """Parse the Reuters-21578 corpus under ``r``.

    Feeds the contents of every ``*.sgm`` file to a fresh utility.MyParser
    and merges the per-file results.

    Returns:
        (total_topics, total_texts): two dicts accumulated across all
        parsed files via dict.update (later files win on key clashes).
    """
    total_topics = {}
    total_texts = {}
    file_path_list, file_name_list = utility.get_file_list(r)
    for file_path in file_path_list:
        if file_path.endswith("sgm"):
            parser = utility.MyParser()
            # `with` closes the handle promptly; the original
            # open(...).read() leaked it until garbage collection.
            with open(file_path) as sgm_file:
                parser.parse(sgm_file.read())
            total_topics.update(parser.get_topics())
            total_texts.update(parser.get_texts())
    return total_topics, total_texts
Example #9
0
def prepare_reuters21578(r):
    """Parse the Reuters-21578 corpus under ``r``.

    Feeds the contents of every ``*.sgm`` file to a fresh utility.MyParser
    and merges the per-file results.

    Returns:
        (total_topics, total_texts): two dicts accumulated across all
        parsed files via dict.update (later files win on key clashes).
    """
    total_topics = {}
    total_texts = {}
    file_path_list, file_name_list = utility.get_file_list(r)
    for file_path in file_path_list:
        if file_path.endswith("sgm"):
            parser = utility.MyParser()
            # `with` closes the handle promptly; the original
            # open(...).read() leaked it until garbage collection.
            with open(file_path) as sgm_file:
                parser.parse(sgm_file.read())
            total_topics.update(parser.get_topics())
            total_texts.update(parser.get_texts())
    return total_topics, total_texts
Example #10
0
def get_click_pos(log_base_dir, day, filters,
                  host="127.0.0.1", port=63791, db_index=13,
                  password="******", index=24):
    """Collect the ad -> click-position mapping from one day's logs.

    Args:
        log_base_dir, day: joined to locate the day's log directory.
        filters: file-name filters passed to utility.get_file_list.
        host, port, db_index, password: Redis connection settings.  The
            defaults preserve the previously hard-coded values; the
            password default is a redacted placeholder, so real callers
            should supply their own.
        index: log-line column holding the click position (default 24,
            the previously hard-coded value).

    Returns:
        dict accumulated by parse_file across all matching log files.
    """
    log_dir_name = os.path.join(log_base_dir, day)
    # Diagnostics so a wrong path or filter is easy to spot on the console.
    print(log_dir_name)
    print(os.path.abspath(log_dir_name))
    print(filters)
    print(os.path.curdir)
    file_list = utility.get_file_list(log_dir_name, filters)
    print("number of log file is %d" % len(file_list))
    redis_client = connect_redis(host=host,
                                 port=port,
                                 db_index=db_index,
                                 password=password)
    ad_click_pos_map = dict()
    for file_name in file_list:
        parse_file(file_name, redis_client, ad_click_pos_map, index=index)

    return ad_click_pos_map
Example #11
0
def extract_mod_paths_openmw(modlist):
    """Resolve each 'content' entry of an OpenMW config against its 'data'
    directories.

    Returns one entry per content file: the absolute path where it was
    found, or '' (plus a printed warning) when no data directory holds it.
    """
    content = modlist.get('settings', 'content')
    content = content.split('\n')
    # Data paths may be quoted in the config file: strip quotes, absolutize.
    # Python 2: map() returns lists here, so data_dirs is safely re-iterated
    # once per content entry below.
    data_dirs = modlist.get('settings', 'data').split('\n')
    data_dirs = map(lambda x: x.strip('"'), data_dirs)
    data_dirs = map(lambda x: os.path.abspath(x), data_dirs)

    res = []
    for c in content:
        full_path = ''
        for d in data_dirs:
            # NOTE(review): every data dir is re-listed for every content
            # entry (O(content x dirs) directory scans) — hoisting the
            # listings out of the outer loop would be cheaper.
            flist = ut.get_file_list(d)
            flist = map(lambda x: os.path.basename(x), flist)
            if c in flist:
                full_path = os.path.join(d, c)  # last matching dir wins
        # os.path.exists('') is False, so unfound entries warn with an
        # empty path appended to the message.
        if not os.path.exists(full_path):
            print 'Warning: could not find ' + full_path
        res.append(full_path)
    return res
Example #12
0
def process_mgefs():
    """Post-process Morrowind magic-effect (MGEF) records.

    First pass: decorates every original MGEF record with a resolved icon
    path, its decoded MEDT data and its name, and builds the
    index -> school lookup table.  Second pass: applies the plaintext
    overrides from content/magic_effects to the records they name and
    re-encodes them, regenerating icons when an effect's school changed.
    Finally registers the modified records with the output writer.
    """

    icon_dir = os.path.abspath('output/icons')
    if not os.path.exists(icon_dir):
        os.makedirs(icon_dir)

    idata.index_data['mgef_to_school'] = {}
    for name, mgef in esm.records_original['MGEF'].iteritems():  # Python 2 dict iteration
        # Map the record's ITEX icon reference (drop trailing char, strip NULL
        # terminator, .tga -> .dds, backslashes -> slashes) to an on-disk path.
        mgef['icon_base'] = os.path.abspath(
            'data/icons/' + mgef['ITEX'][0:-1].lower().strip(NULL).replace(
                '.tga', '.dds').replace('\\', '/'))
        index = idata.get(name, 'magic_effects')
        # Decode the packed MEDT subrecord into named fields on the record
        # (this is where mgef['school'] comes from).
        mgef.update(schema.decode_subrecord(mgef['MEDT'], 'MEDT'))
        mgef['name'] = name
        idata.index_data['mgef_to_school'][index] = mgef['school']
        # Remember the first icon seen per school as that school's reference.
        if mgef['school'] not in reference_icons:
            reference_icons[mgef['school']] = mgef['icon_base']

    input_dir = os.path.abspath('content/magic_effects')
    input_files = ut.get_file_list(input_dir)

    mgefs_new = {}
    for f in input_files:
        data = ut.read_newline_sep(f)
        for d in data:
            name = d[0]  # first line of each group names the effect to modify
            if name not in esm.records_original['MGEF']:
                continue
            else:
                # NOTE(review): this aliases the original record — the edits
                # below mutate esm.records_original in place; confirm intended.
                mgefs_new[name] = esm.records_original['MGEF'][name]
            mgef = mgefs_new[name]
            school_old = mgef['school']
            if len(d) > 1:
                mgef.update(schema.decode_plaintext(d[1:], 'MEDT'))
            schema.encode_all_subrecords(mgef)

            if config.options.getboolean('settings', 'regenerate_spell_icons'):
                # A changed school means the old icon no longer matches.
                if school_old != mgef['school']:
                    mgef['ITEX'] = make_new_icon(mgef)

    output_names = ['spellmod', 'everything']
    outputs.update({'MGEF': mgefs_new}, output_names)
Example #13
0
def parse_normal(r, parse_method, dict_method, table):
    dataset = table
    start_time = time.time()
    #topic_method(r)
    file_path_list, file_name_list = utility.get_file_list(r)
    dict = dict_method(r, table, parse_method)

    madlib = open(dataset + '.madlib', 'w')

    file_count = len(file_path_list)

    for i in range(0, file_count):
        lines = parse_method(file_path_list[i])
        words = utility.get_file_words(lines)
        if len(words) == 0: continue
        #for madlib

        line = utility.get_madlib_line(words, dict)
        madlib.write(('%s : %s') % (str(i + 1), line))

    madlib.close()
    elapsed_time = time.time() - start_time
    print r, 'parsing time:', elapsed_time
Example #14
0
def parse_normal(r, parse_method, dict_method, table):
    dataset = table
    start_time = time.time()
    #topic_method(r)
    file_path_list, file_name_list = utility.get_file_list(r)
    dict  = dict_method(r, table, parse_method)

    madlib = open(dataset + '.madlib', 'w')

    file_count = len(file_path_list)

    for i in range(0, file_count):
        lines = parse_method(file_path_list[i])
        words = utility.get_file_words(lines)
        if len(words) == 0: continue
        #for madlib
        
        line = utility.get_madlib_line(words, dict)
        madlib.write(('%s : %s') % (str(i + 1), line))

    madlib.close()
    elapsed_time = time.time() - start_time
    print r, 'parsing time:', elapsed_time
Example #15
0
# find mod files by looking in various paths, etc.

from parameters import *
import utility as ut
import index_data as idata
import struct
import os

# Collect Morrowind master files from the configured data path: keep any
# path containing 'orrow' (matches 'Morrowind'/'morrowind' regardless of
# the first letter's case) and '.esm'.
data_files = []
for f in ut.get_file_list(MW_DATA_PATH):
    if not 'orrow' in f: continue
    if '.esm' in f: data_files.append(os.path.abspath(f))
#data_files = ["test.esm"]

Example #16
0
from parameters import *
import utility as ut
import struct
import os

# provides mappings for morrowind's enumerated data types such as magic effects

# Location of the enumeration tables shipped with the tool (magic effects,
# skills, ...), resolved to an absolute path.
index_data_dir = 'data/index_data'
index_data_dir = os.path.abspath(index_data_dir)
index_data = {}  # table name -> mapping; presumably filled from index_data_files below — confirm
aliases = {}     # alternate key spellings; populated elsewhere in this module

index_data_files = ut.get_file_list(index_data_dir)


def try_subkeys(k, ref):
    """Find the first string key of *ref* containing the reduced form of *k*.

    *ref* may be a dict or the name of a table in ``index_data``.  Returns
    the matching key, or None when nothing matches.
    """
    k = ut.reductions(k)
    if not isinstance(ref, dict):
        ref = index_data[ref]

    # Scan keys in the dict's own iteration order; stop at the first hit.
    return next(
        (key for key in ref if isinstance(key, str) and k in key),
        None)


def complete_key(k, data_type):
    # Resolve *k* through find_key (defined elsewhere) twice, so that a
    # partially-resolved key gets a second chance to land on the canonical
    # form for *data_type*.  NOTE(review): intent inferred from the name —
    # confirm against find_key's behavior.
    return find_key(find_key(k, data_type), data_type)

Example #17
0
# Supported archive extensions mapped to the shell command used to unpack
# each one (command strings are run elsewhere; paths are appended later).
archive_types = {
    'rar': {
        'command': 'unrar x'
    },
    'zip': {
        'command': 'unzip'
    },
    '7z': {
        'command': '7za x'
    },
}

# Downloaded archives live under <MOD_DIRECTORY>/downloads; each one gets
# its own extraction directory under <MOD_DIRECTORY>/extracted.
dl_path = os.path.abspath(os.path.expanduser(MOD_DIRECTORY) + '/downloads')
extract_path_base = os.path.abspath(
    os.path.expanduser(MOD_DIRECTORY) + '/extracted')
for f in ut.get_file_list(dl_path):
    for a, arc in archive_types.iteritems():  # Python 2 dict iteration
        if f.lower().endswith(a):

            # Create the per-archive target directory on demand.
            # NOTE(review): the unpack command (arc['command']) is not invoked
            # in this visible span — presumably it runs further down; confirm.
            extract_path = os.path.abspath(extract_path_base + '/' +
                                           os.path.basename(f))
            if not os.path.exists(extract_path):
                os.makedirs(extract_path)


def get_file_list(base_path):
    """Return the full paths of the regular entries directly in *base_path*.

    Subdirectories are skipped; there is no recursion.  Entry order follows
    os.listdir (platform-dependent, arbitrary).
    """
    res = []
    for fname in os.listdir(base_path):
        path = os.path.join(base_path, fname)
        # Only non-directory entries are collected.
        if os.path.isdir(path): continue
        res.append(path)
    return res  # BUG FIX: the original fell off the end and returned None
Example #18
0
        bag_of_word = process_file(file_name)
        x.append(bag_of_word)
    if training:
        text_counts = tf.fit_transform(x)
    else:
        text_counts = tf.transform(x)
    return text_counts


if __name__ == "__main__":
    #trying to use the same cv to preserve the dictionary
    cv = CountVectorizer(analyzer='word', lowercase=True, ngram_range=(1, 1))
    tf = TfidfVectorizer(input='content')

    vectorizer = cv
    file_names = utility.get_file_list()
    # using different way to vectorize
    x = convert_file_to_array(vectorizer, file_names, True)
    # x = convert_file_to_array(tf,file_names, True)
    y = utility.get_target_values(file_names)

    #use the build-in split function to validate the prediction:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    clf = MultinomialNB(alpha=1.0).fit(X_train, y_train)
    predicted = clf.predict(X_test)

    #pick 3 data point and predict:
    '''
    clf = MultinomialNB(alpha=1.0 ).fit(x,y)
    test_file_names  = ['datafeeder\\data\\2020_03_05\\feed.txt',
        'datafeeder\\data\\2020_03_27\\feed.txt',
Example #19
0
from parameters import *
import utility as ut
import index_data as idata
import esm

import struct
import os
import schema
import utility as ut
import materials as mat

# Load plaintext item-type definitions.  Each file under content/item_types
# is named "<RECTYPE>_...", and ut.read_newline_sep splits it into groups of
# lines; the result is item_types[rec_type][reduced name] -> decoded fields.
input_dir = os.path.abspath("content/item_types")
input_files = ut.get_file_list(input_dir)
item_types = {}

for f in input_files:
    data = ut.read_newline_sep(f)
    # Record type comes from the file-name prefix, e.g. "WEAP_*" -> WEAP.
    rec_type = (os.path.basename(f)).split('_')[0]
    item_types[rec_type] = {}
    for d in data:
        schemename = rec_type + '_type'
        t = {}
        name = ut.reductions(d[0])
        # WEAP groups: drop the first line (already captured in `name` above)
        # before decoding the remaining lines.
        if rec_type == 'WEAP': d = d[1:]

        # Single-line groups decode to an empty dict.
        if (len(d) > 1):
            t = schema.decode_plaintext(d, schemename)
        item_types[rec_type][name] = t

#ITEM_CLASSES = ['ARMO', 'WEAP']
ITEM_CLASSES = ['WEAP']
Example #20
0
from parameters import *
import utility as ut
import index_data as idata
import esm

import struct
import os
import schema
import outputs
import item_types

# Collect the plaintext item definition files, grouped by the basename of
# the subdirectory they live in under content/items.
input_dir = os.path.abspath("content/items")
input_files = {}
for d in ut.get_dir_list(input_dir):
    input_files[os.path.basename(d)] = ut.get_file_list(d)

# New item records accumulate here; populated later in this module.
new_items = {}

def read_id(s, rtype):
    """Wrap the raw id string *s* in a NAME subrecord dict (*rtype* unused)."""
    return dict(NAME=s)

def read_enam(s, rtype):
    # Build an ENAM subrecord: the id gets the project-wide PREFIX and a
    # trailing NULL terminator (both constants from parameters); *rtype*
    # is accepted for reader-signature uniformity but unused.
    return {'ENAM' : PREFIX+s+NULL}

def read_copy(s, rtype):
    """Return the original *rtype* record named *s*, or None when absent.

    (The original's membership-test-then-index is equivalent to a single
    dict.get: a missing key yields None either way.)
    """
    return esm.records_original[rtype].get(s)

def read_type(s, rtype):
    s = s.split()
    res = {'material' : s[0],
Example #21
0
from parameters import *
import utility as ut
import index_data as idata
import esm

import struct
import os

# Directory of record-schema description files shipped with the tool.
SCHEME_DIR = os.path.abspath('data/schema')

scheme_files = ut.get_file_list(SCHEME_DIR)

# Schema type names -> format characters (presumably struct-module codes,
# given this module imports struct — confirm) used for on-disk record data.
scheme_data_mappings = {
    'float': 'f',
    'long': 'i',
    'short': 'h',
    'longflag': 'i',
    'byte': 'b',
}
# Same mapping for values as they appear in plaintext input; note 'long'
# and 'short' are read as floats here, unlike the on-disk mapping above.
input_data_mappings = {
    'float': 'f',
    'long': 'f',
    'short': 'f',
    'longflag': 'i',
    'byte': 'b',
}
key_type_functions = {
    'f': float,
    'i': int,
    'h': int,
    'b': int,
Example #22
0
    comments = []
    for l in f.readlines():
        if l[0] == '#': comments.append(l)
        else: script_body += l

    for c in comments:
        if 'startscript' in c:
            make_startscript(script_name)

    exec(script_body)
    s = startscript(script_name) + s + endscript(script_name)
    new_scripts[script_name] = pack_script(s, script_name)


# Compile every script file under data/scripts via make_script (defined above).
script_dir = os.path.abspath('data/scripts')
sfiles = ut.get_file_list(script_dir)
for s in sfiles:
    make_script(s)

# Global variables to create: name -> (type character, initial value).
globdata = {
    "player_current_buffs": ('l', 0),
    "player_buff_points": ('l', 5),
}

# Build GLOB records: NAME holds the prefixed, NUL-terminated identifier,
# FNAM the single type character, FLTV the packed little-endian int value.
globs = {}
for glob, info in globdata.iteritems():  # Python 2 dict iteration
    res = {}
    res['NAME'] = [PREFIX + glob + NULL]
    res['FNAM'] = [struct.pack('c', info[0])]
    res['FLTV'] = [struct.pack('<i', info[1])]
    globs[PREFIX + glob] = res