def get_click_user(log_base_dir, day, filters):
    log_dir_name = os.path.join(log_base_dir, day)
    print(log_dir_name)
    print(os.path.abspath(log_dir_name))
    print(filters)
    print(os.path.curdir)
    file_list = utility.get_file_list(log_dir_name, filters)
    print("number of log files is %d" % len(file_list))
    # Redis connection parameters and the log-column index come from config.
    redis_host = config.get_value("redis_server", "host")
    redis_port = config.get_int_value("redis_server", "port")
    redis_password = config.get_value("redis_server", "password")
    db_index = config.get_int_value("redis_server", "push_id_index")
    user_id_index = config.get_int_value("rtb_log_index", "user_id")
    redis_client = connect_redis(host=redis_host, port=redis_port,
                                 db_index=db_index, password=redis_password)
    ad_click_user_map = dict()
    for file_name in file_list:
        parse_file(file_name, redis_client, ad_click_user_map,
                   index=user_id_index)
    return ad_click_user_map
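# A minimal usage sketch for get_click_user. The base directory, day string,
# and filename filters below are hypothetical placeholders; config and
# connect_redis are assumed to be this module's helpers, configured elsewhere.
if __name__ == "__main__":
    click_user_map = get_click_user("/data/rtb_logs", "2020-03-05", ["*.log"])
    print("entries collected: %d" % len(click_user_map))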
def parse_nsf(r, table):
    start_time = time.time()
    # Load the vocabulary, one word per line.
    dict_in_name = r + '/words.txt'
    infile = open(dict_in_name, 'r')
    total_words = [w.strip() for w in infile.readlines()]
    #for line in infile.readlines():
    #    total_words.append(line.split()[1].strip())
    # Write the vocabulary out as a single quoted, comma-separated list.
    dict_out_name = table + '.dict'
    outfile = open(dict_out_name, 'w')
    outfile.write("'")
    outfile.write("','".join(total_words))
    outfile.write("'")
    infile.close()
    outfile.close()
    file_path_list, file_name_list = utility.get_file_list(r)
    madlib = open(table + '.madlib', 'w')
    madlib_out = {}
    google_out = {}
    R_out = {}
    for file_path in file_path_list:
        if file_path.endswith("docwords.txt"):
            infile = open(file_path)
            for line in infile.readlines():
                # Each line is "<doc_id> <word_id> <count>"; word IDs are
                # 1-based, so IDs up to len(total_words) are valid.
                parts = line.split()
                word_id = int(parts[1])
                count = int(parts[2])
                if word_id > len(total_words):
                    continue
                # MADlib format: repeat the word ID once per occurrence.
                occurrences = ','.join([parts[1]] * count)
                if parts[0] in madlib_out:
                    madlib_out[parts[0]] += ',' + occurrences
                else:
                    madlib_out[parts[0]] = occurrences
                token = total_words[word_id - 1] + ' ' + parts[2] + ' '
                if parts[0] in google_out:
                    google_out[parts[0]] += token
                else:
                    google_out[parts[0]] = token
                pair = parts[1] + ':' + parts[2] + ' '
                if parts[0] in R_out:
                    R_out[parts[0]] += pair
                else:
                    R_out[parts[0]] = pair
    j = 1
    for key, value in madlib_out.iteritems():
        madlib.write(str(j) + ":" + value + '\n')
        j = j + 1
    # for key, value in google_out.iteritems():
    #     google.write(value + '\n')
    # for key, value in R_out.iteritems():
    #     R.write(str(len(value.split(':')) + 1) + ' ' + value + '\n')
    madlib.close()
    elapsed_time = time.time() - start_time
    print r, 'parsing time:', elapsed_time
def dict_by_file(r, table, parse_method=None):
    file_path_list, file_name_list = utility.get_file_list(r)
    total_words = []
    for file_path in file_path_list:
        # get_file_list returns full paths, so test the basename for the
        # 'all' prefix rather than the whole path.
        if os.path.basename(file_path).startswith('all'):
            total_words += open(file_path, 'r').readlines()
    vocab = utility.unique(total_words)  # avoid shadowing the built-in dict
    utility.write_dict_file(vocab, r.split('/')[-1] + '.dict')
    return vocab
def dict_by_text(r, table, parse_method=None):
    file_path_list, file_name_list = utility.get_file_list(r)
    total_words = []
    for file_path in file_path_list:
        lines = parse_method(file_path)
        total_words += utility.get_file_words(lines)
    vocab = utility.unique(total_words)  # avoid shadowing the built-in dict
    utility.write_dict_file(vocab, table + '.dict')
    return vocab
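# A hedged sketch of how dict_by_text is driven: parse_method can be any
# callable that maps a file path to a list of text lines. plain_text_lines
# below is a hypothetical helper, and the corpus path is a placeholder.
def plain_text_lines(file_path):
    with open(file_path, 'r') as f:
        return f.readlines()

# vocab = dict_by_text('datasets/sample', 'sample', parse_method=plain_text_lines)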
def prepare_reuters21578(r):
    total_topics = {}
    total_texts = {}
    #outfile = open(r.split('/')[-1] + '.topic', 'w')
    file_path_list, file_name_list = utility.get_file_list(r)
    for file_path in file_path_list:
        if file_path.endswith("sgm"):
            parser = utility.MyParser()
            parser.parse(open(file_path).read())
            total_topics.update(parser.get_topics())
            total_texts.update(parser.get_texts())
    #outfile.close()
    return total_topics, total_texts
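# Hypothetical usage of prepare_reuters21578: point it at a directory holding
# the unpacked Reuters-21578 .sgm files. The path below is a placeholder.
if __name__ == '__main__':
    topics, texts = prepare_reuters21578('datasets/reuters21578')
    print('%d documents with topics, %d with text' % (len(topics), len(texts)))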
def get_click_pos(log_base_dir, day, filters):
    log_dir_name = os.path.join(log_base_dir, day)
    print(log_dir_name)
    print(os.path.abspath(log_dir_name))
    print(filters)
    print(os.path.curdir)
    file_list = utility.get_file_list(log_dir_name, filters)
    print("number of log files is %d" % len(file_list))
    # Connection details and the log-column index are hard-coded here,
    # unlike get_click_user, which reads them from the config file.
    redis_client = connect_redis(host="127.0.0.1", port=63791,
                                 db_index=13, password="******")
    ad_click_pos_map = dict()
    for file_name in file_list:
        parse_file(file_name, redis_client, ad_click_pos_map, index=24)
    return ad_click_pos_map
def extract_mod_paths_openmw(modlist):
    content = modlist.get('settings', 'content')
    content = content.split('\n')
    data_dirs = modlist.get('settings', 'data').split('\n')
    data_dirs = [os.path.abspath(d.strip('"')) for d in data_dirs]
    res = []
    for c in content:
        full_path = ''
        # Search every data directory; the last match wins.
        for d in data_dirs:
            flist = [os.path.basename(p) for p in ut.get_file_list(d)]
            if c in flist:
                full_path = os.path.join(d, c)
        if not os.path.exists(full_path):
            print 'Warning: could not find ' + full_path
        res.append(full_path)
    return res
def process_mgefs():
    icon_dir = os.path.abspath('output/icons')
    if not os.path.exists(icon_dir):
        os.makedirs(icon_dir)
    idata.index_data['mgef_to_school'] = {}
    for name, mgef in esm.records_original['MGEF'].iteritems():
        mgef['icon_base'] = os.path.abspath(
            'data/icons/' + mgef['ITEX'][0:-1].lower().strip(NULL).replace(
                '.tga', '.dds').replace('\\', '/'))
        index = idata.get(name, 'magic_effects')
        mgef.update(schema.decode_subrecord(mgef['MEDT'], 'MEDT'))
        mgef['name'] = name
        idata.index_data['mgef_to_school'][index] = mgef['school']
        if mgef['school'] not in reference_icons:
            reference_icons[mgef['school']] = mgef['icon_base']
    input_dir = os.path.abspath('content/magic_effects')
    input_files = ut.get_file_list(input_dir)
    mgefs_new = {}
    for f in input_files:
        data = ut.read_newline_sep(f)
        for d in data:
            name = d[0]
            if name not in esm.records_original['MGEF']:
                continue
            mgefs_new[name] = esm.records_original['MGEF'][name]
            mgef = mgefs_new[name]
            school_old = mgef['school']
            if len(d) > 1:
                mgef.update(schema.decode_plaintext(d[1:], 'MEDT'))
            schema.encode_all_subrecords(mgef)
            if config.options.getboolean('settings', 'regenerate_spell_icons'):
                if school_old != mgef['school']:
                    mgef['ITEX'] = make_new_icon(mgef)
    output_names = ['spellmod', 'everything']
    outputs.update({'MGEF': mgefs_new}, output_names)
def parse_normal(r, parse_method, dict_method, table):
    dataset = table
    start_time = time.time()
    #topic_method(r)
    file_path_list, file_name_list = utility.get_file_list(r)
    vocab = dict_method(r, table, parse_method)  # avoid shadowing dict
    madlib = open(dataset + '.madlib', 'w')
    file_count = len(file_path_list)
    for i in range(0, file_count):
        lines = parse_method(file_path_list[i])
        words = utility.get_file_words(lines)
        if len(words) == 0:
            continue
        # for madlib
        line = utility.get_madlib_line(words, vocab)
        madlib.write(('%s : %s') % (str(i + 1), line))
    madlib.close()
    elapsed_time = time.time() - start_time
    print r, 'parsing time:', elapsed_time
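if __name__ == '__main__':
    # Hypothetical end-to-end run of the generic driver: parse_normal composes
    # a per-file parser with a dictionary builder and writes <table>.madlib.
    # It reuses the plain_text_lines sketch from above; 'datasets/sample' is a
    # placeholder corpus root.
    parse_normal('datasets/sample', plain_text_lines, dict_by_text, 'sample')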
# find mod files by looking in various paths, etc.
from parameters import *
import utility as ut
import index_data as idata
import struct
import os

data_files = []
for f in ut.get_file_list(MW_DATA_PATH):
    # 'orrow' matches 'Morrowind'/'morrowind' regardless of case.
    if 'orrow' not in f:
        continue
    if '.esm' in f:
        data_files.append(os.path.abspath(f))
#data_files = ["test.esm"]
from parameters import *
import utility as ut
import struct
import os

# provides mappings for morrowind's enumerated data types such as magic effects
index_data_dir = 'data/index_data'
index_data_dir = os.path.abspath(index_data_dir)

index_data = {}
aliases = {}
index_data_files = ut.get_file_list(index_data_dir)

def try_subkeys(k, ref):
    k = ut.reductions(k)
    if not isinstance(ref, dict):
        ref = index_data[ref]
    for key in ref:
        if isinstance(key, str):
            if k in key:
                return key
    return None

def complete_key(k, data_type):
    return find_key(find_key(k, data_type), data_type)
archive_types = {
    'rar': {'command': 'unrar x'},
    'zip': {'command': 'unzip'},
    '7z': {'command': '7za x'},
}

dl_path = os.path.abspath(os.path.expanduser(MOD_DIRECTORY) + '/downloads')
extract_path_base = os.path.abspath(
    os.path.expanduser(MOD_DIRECTORY) + '/extracted')

for f in ut.get_file_list(dl_path):
    for a, arc in archive_types.iteritems():
        if f.lower().endswith(a):
            extract_path = os.path.abspath(
                extract_path_base + '/' + os.path.basename(f))
            if not os.path.exists(extract_path):
                os.makedirs(extract_path)

def get_file_list(base_path):
    res = []
    for fname in os.listdir(base_path):
        path = os.path.join(base_path, fname)
        if os.path.isdir(path):
            continue
        res.append(path)
    return res
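# A small sketch of get_file_list: it returns the full paths of the regular
# files directly under base_path, skipping subdirectories. The directory
# below is a hypothetical placeholder for the downloads folder.
if __name__ == '__main__':
    for path in get_file_list(os.path.expanduser('~/mods/downloads')):
        print(os.path.basename(path))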
# Imports implied by the names used below (the fragment starts mid-file).
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

import utility


def convert_file_to_array(vectorizer, file_names, training):
    # Signature inferred from the calls below; uses the vectorizer that is
    # passed in, so cv and tf are interchangeable.
    x = []
    for file_name in file_names:
        bag_of_word = process_file(file_name)
        x.append(bag_of_word)
    if training:
        text_counts = vectorizer.fit_transform(x)
    else:
        text_counts = vectorizer.transform(x)
    return text_counts


if __name__ == "__main__":
    # trying to use the same cv to preserve the dictionary
    cv = CountVectorizer(analyzer='word', lowercase=True, ngram_range=(1, 1))
    tf = TfidfVectorizer(input='content')
    vectorizer = cv
    file_names = utility.get_file_list()
    # using different ways to vectorize
    x = convert_file_to_array(vectorizer, file_names, True)
    # x = convert_file_to_array(tf, file_names, True)
    y = utility.get_target_values(file_names)
    # use the built-in split function to validate the prediction:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    clf = MultinomialNB(alpha=1.0).fit(X_train, y_train)
    predicted = clf.predict(X_test)
    # pick 3 data points and predict:
    '''
    clf = MultinomialNB(alpha=1.0).fit(x, y)
    test_file_names = ['datafeeder\\data\\2020_03_05\\feed.txt',
                       'datafeeder\\data\\2020_03_27\\feed.txt',
from parameters import *
import utility as ut
import index_data as idata
import esm
import struct
import os
import schema
import materials as mat

input_dir = os.path.abspath("content/item_types")
input_files = ut.get_file_list(input_dir)

item_types = {}
for f in input_files:
    data = ut.read_newline_sep(f)
    rec_type = (os.path.basename(f)).split('_')[0]
    item_types[rec_type] = {}
    for d in data:
        schemename = rec_type + '_type'
        t = {}
        name = ut.reductions(d[0])
        if rec_type == 'WEAP':
            d = d[1:]
        if len(d) > 1:
            t = schema.decode_plaintext(d, schemename)
        item_types[rec_type][name] = t

#ITEM_CLASSES = ['ARMO', 'WEAP']
ITEM_CLASSES = ['WEAP']
from parameters import *
import utility as ut
import index_data as idata
import esm
import struct
import os
import schema
import outputs
import item_types

input_dir = os.path.abspath("content/items")
input_files = {}
for d in ut.get_dir_list(input_dir):
    input_files[os.path.basename(d)] = ut.get_file_list(d)

new_items = {}

def read_id(s, rtype):
    return {'NAME': s}

def read_enam(s, rtype):
    return {'ENAM': PREFIX + s + NULL}

def read_copy(s, rtype):
    if s in esm.records_original[rtype]:
        return esm.records_original[rtype][s]

def read_type(s, rtype):
    s = s.split()
    res = {'material': s[0],
from parameters import *
import utility as ut
import index_data as idata
import esm
import struct
import os

SCHEME_DIR = os.path.abspath('data/schema')
scheme_files = ut.get_file_list(SCHEME_DIR)

# struct format codes for decoding fields stored in the binary records
scheme_data_mappings = {
    'float': 'f',
    'long': 'i',
    'short': 'h',
    'longflag': 'i',
    'byte': 'b',
}

# format codes for values supplied as plain text (long/short are read as floats)
input_data_mappings = {
    'float': 'f',
    'long': 'f',
    'short': 'f',
    'longflag': 'i',
    'byte': 'b',
}

# converters that normalize unpacked values to the right Python type
key_type_functions = {
    'f': float,
    'i': int,
    'h': int,
    'b': int,
}
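# Hedged illustration of how these tables are presumably used together: a
# field's declared type selects a struct format code, struct.unpack (struct is
# imported above) decodes the raw bytes, and key_type_functions normalizes the
# Python type. The payload below is made up: little-endian bytes for 1.0.
fmt = '<' + scheme_data_mappings['float']            # -> '<f'
(value,) = struct.unpack(fmt, b'\x00\x00\x80\x3f')   # -> 1.0
value = key_type_functions[scheme_data_mappings['float']](value)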
comments = []
for l in f.readlines():
    if l[0] == '#':
        comments.append(l)
    else:
        script_body += l
for c in comments:
    if 'startscript' in c:
        make_startscript(script_name)
exec(script_body)
s = startscript(script_name) + s + endscript(script_name)
new_scripts[script_name] = pack_script(s, script_name)

script_dir = os.path.abspath('data/scripts')
sfiles = ut.get_file_list(script_dir)
for s in sfiles:
    make_script(s)

globdata = {
    "player_current_buffs": ('l', 0),
    "player_buff_points": ('l', 5),
}

globs = {}
for glob, info in globdata.iteritems():
    res = {}
    res['NAME'] = [PREFIX + glob + NULL]
    res['FNAM'] = [struct.pack('c', info[0])]
    res['FLTV'] = [struct.pack('<i', info[1])]
    globs[PREFIX + glob] = res