def test(character_idx_map, options, params, path, filename, batch_size=512):
    X = tools.prepareData(character_idx_map, path, test=True)
    dropout = (1 - options['dropout_rate']) * np.ones((options['ndims'],), dtype=theano.config.floatX)
    start, n = 0, len(X)
    idx_list = range(n)
    lens = [len(x) for x in X]
    idx_list = sorted(idx_list, cmp=lambda x, y: cmp(lens[x], lens[y]))
    Y = []
    print 'count_test_sentences', len(X)
    for i in range(n // batch_size):
        batch_idx = idx_list[start:start + batch_size]
        x = [X[t] for t in batch_idx]
        x_lens = [lens[t] for t in batch_idx]
        x = tools.asMatrix(x)
        sY = tools.segment(params, options, x, x_lens, dropout)
        Y.extend(sY)
        start += batch_size
    if start != n:
        batch_idx = idx_list[start:]
        x = [X[t] for t in batch_idx]
        x_lens = [lens[t] for t in batch_idx]
        x = tools.asMatrix(x)
        sY = tools.segment(params, options, x, x_lens, dropout)
        Y.extend(sY)
    table = {}
    nb = 0
    for idx in idx_list:
        table[idx] = nb
        nb += 1
    output_result(Y, table, path, filename)
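# Usage sketch (an assumption, not part of the original module): evaluate a model that
# train_model() below has already saved. It relies on the JSON 'config' file and the
# 'epoch_<n>.npz' archives that train_model() writes; the epoch number and output paths
# here are placeholders.
def _example_evaluate_saved_model():
    import json
    import numpy as np
    config = json.loads(open('config').read())
    archive = np.load('epoch_1.npz')  # hypothetical choice of epoch
    params = {k: archive[k] for k in archive.files}  # assumed to stand in for get_params() output
    test(config['character_idx_map'], config['options'], params,
         '../data/dev', '../result/dev_result_eval')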
def write2traindata(filename, modelfolder, tempfolderpath, output):
    in_file = filename + '.xml'
    sen_file = in_file + '.sen'
    ent_file = in_file + '.ent'
    tools.split_sentence(in_file, sen_file)
    dic_sen_ent = tools.get_sentence_entity(sen_file, ent_file)
    seg_model = os.path.join(modelfolder, 'segmenter.model')
    pos_model = os.path.join(modelfolder, 'postagger.model')
    for k, v in dic_sen_ent.items():
        if len(v) == 0:
            continue
        entities = sorted(v, key=lambda ent: ent.start_pos)
        sentence, title = k.split('###')[:2]
        sentence_input = sentence.replace('\t', ' ')
        sentence_input = sentence_input.replace('\r\n', '')
        sentence_uniform = tools.uniform(sentence_input)
        sentence_seg = tools.segment(sentence_uniform, seg_model, tempfolderpath)
        seg_tags = tools.getSegmentTags(sentence_uniform, sentence_seg)
        words, postags = tools.pos(sentence_seg, pos_model, tempfolderpath)
        pos_tags = tools.getPosTags(sentence_uniform, words, postags)
        ent_tags = tools.getEntityTags(sentence_uniform, entities)
        # one output line per character: character, seg_tag, seg_tag-pos_tag, ent_tag
        sentence_unicode = sentence_uniform.decode('utf8')
        j = 0
        for i in range(len(sentence_unicode)):
            if sentence_unicode[i] == ' ':
                continue
            else:
                output.write('%s\t%s\t%s\t%s\n' % (sentence_unicode[i], seg_tags[j],
                                                   seg_tags[j] + '-' + pos_tags[j], ent_tags[i]))
                j += 1
        output.write('\n')
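# Usage sketch (an assumption, not part of the original module): build one character-level
# training file from a folder of annotated documents. write2traindata() expects the document
# path without the '.xml' suffix and an already-open output handle; the folder and file names
# below are placeholders.
def _example_build_train_file(doc_folder, modelfolder, tempfolderpath, out_path):
    import os
    with open(out_path, 'w') as output:
        for fn in os.listdir(doc_folder):
            if fn.endswith('.xml'):
                write2traindata(os.path.join(doc_folder, fn[:-len('.xml')]),
                                modelfolder, tempfolderpath, output)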
def boundaryFeatureGeneration(sentence, entities_in_sen, ebao_dic, label_type, tag_strategy):
    sentence_seg = tools.segment(sentence)  # the input sentence contains no spaces
    if len(entities_in_sen) > 0:  # training files built from annotated data may contain sentences without entities
        entity_position_list, sentence_reseg = get_entity_pos(entities_in_sen, sentence, sentence_seg)
    else:
        sentence_reseg = sentence_seg
    # word list and part-of-speech list
    word_list, postag_list = tools.pos(sentence_reseg)
    # per-character segmentation and POS tags (no spaces)
    char_seg_tag_list, char_pos_tag_list = getCharSegPosTag(word_list, postag_list)
    # per-character entity tags and dictionary-match boundary tags
    if label_type == 'demo':
        sen_no_use, ents_matched = tools.matchEntityCombine(sentence, ebao_dic)
        ents_matched = string2Entity(ents_matched)
        dic_match_tag_list = getCharEntityFullTag(sentence.decode('utf-8'), ents_matched, ebao_dic)  # typed
        char_entity_tag_list = [[sentence.decode('utf-8')[i], ['O']]
                                for i in range(len(sentence.decode('utf-8')))]
    else:
        if label_type == 'full':
            char_entity_tag_list = getCharEntityFullTag(sentence.decode('utf-8'), entities_in_sen, ebao_dic)
            sen_no_use, ents_matched = tools.matchEntityCombine(sentence, ebao_dic)
            ents_matched = string2Entity(ents_matched)
            dic_match_tag_list = getCharEntityFullTag(sentence.decode('utf-8'), ents_matched, ebao_dic)  # typed
        if label_type == 'partial':
            char_entity_tag_list = getCharEntityFPTag(sentence.decode('utf-8'), entities_in_sen, tag_strategy)
            dic_match_tag_list = getCharEntityFPTag(sentence.decode('utf-8'), entities_in_sen, '1')  # untyped
            char_entity_tag_list = getCharEntityPartialTag(char_entity_tag_list, tag_strategy)
    # build the (character, seg tag, POS tag) list
    char_seg_pos_list = []
    for i in range(len(char_seg_tag_list)):
        char_seg_pos_list.append((char_seg_tag_list[i][0], char_seg_tag_list[i][1], char_pos_tag_list[i][1]))
    bos = '__BOS__'
    eos = '__EOS__'
    new_csp_list = [(bos, bos, bos), (bos, bos, bos)] + char_seg_pos_list + [(eos, eos, eos), (eos, eos, eos)]
    length = len(new_csp_list)
    features = ''
    tags_in_sentence = []
    for i in range(2, length - 2):
        feature_vec = []
        # character features
        feature_vec += [
            new_csp_list[i - 2][0], new_csp_list[i - 1][0], new_csp_list[i][0],
            new_csp_list[i + 1][0], new_csp_list[i + 2][0],
            new_csp_list[i - 2][0] + '/' + new_csp_list[i - 1][0],
            new_csp_list[i - 1][0] + '/' + new_csp_list[i][0],
            new_csp_list[i][0] + '/' + new_csp_list[i + 1][0],
            new_csp_list[i + 1][0] + '/' + new_csp_list[i + 2][0]
        ]
        # word-segmentation tag features
        feature_vec += [
            new_csp_list[i - 2][1], new_csp_list[i - 1][1], new_csp_list[i][1],
            new_csp_list[i + 1][1], new_csp_list[i + 2][1],
            new_csp_list[i - 2][1] + '/' + new_csp_list[i - 1][1],
            new_csp_list[i - 1][1] + '/' + new_csp_list[i][1],
            new_csp_list[i][1] + '/' + new_csp_list[i + 1][1],
            new_csp_list[i + 1][1] + '/' + new_csp_list[i + 2][1]
        ]
        # part-of-speech tag features
        feature_vec += [
            new_csp_list[i - 2][2], new_csp_list[i - 1][2], new_csp_list[i][2],
            new_csp_list[i + 1][2], new_csp_list[i + 2][2],
            new_csp_list[i - 2][2] + '/' + new_csp_list[i - 1][2],
            new_csp_list[i - 1][2] + '/' + new_csp_list[i][2],
            new_csp_list[i][2] + '/' + new_csp_list[i + 1][2],
            new_csp_list[i + 1][2] + '/' + new_csp_list[i + 2][2]
        ]
        # character-type features
        feature_vec += [
            isSpecial(new_csp_list[i - 1][0]), isSpecial(new_csp_list[i][0]),
            isSpecial(new_csp_list[i + 1][0]),
            isSpecial(new_csp_list[i - 1][0]) + '/' + isSpecial(new_csp_list[i][0]),
            isSpecial(new_csp_list[i][0]) + '/' + isSpecial(new_csp_list[i + 1][0])
        ]
        # dictionary features
        dic_boundary_tag = dic_match_tag_list[i - 2][1][0]
        dic_b_tag = dic_boundary_tag[:1]
        feature_vec += [dic_b_tag]
        # entity tag
        try:
            entity_tag = char_entity_tag_list[i - 2][1][0]
            if '|' in entity_tag and len(re.findall('-', entity_tag)) > 1:  # for partial tags
                # if '|' in entity_tag:
                features += entity_tag
            else:
                if '-' in entity_tag:  # for training data that contains entity types
                    parts = entity_tag.split('-')
                    if '|' in parts[1]:
                        entity_tag = parts[0] + '-entity'  # the test set carries no partial tags, so no special handling here
                    ent_tag = parts[0] + '-entity'
                    features += ent_tag
                else:  # O
                    features += entity_tag
        except IndexError as e:
            print sentence
            print i
            print new_csp_list
            print char_entity_tag_list
            return None
        # for strategy 4, demo
        if label_type == 'demo':
            if '-' in dic_boundary_tag:
                features += '\t' + dic_boundary_tag[0] + '-entity'
            else:
                features += '\t' + dic_boundary_tag
        for j in range(len(feature_vec)):
            features += '\tf' + str(j) + '=' + str(feature_vec[j])
        features += '\n'
        tags_in_sentence.append(entity_tag)
    features += '\n'
    return features, tags_in_sentence
def mainfunction(inputstring, taggerb, taggerc):
    if inputstring == '':
        sentence_ner = '请输入句子'  # "please enter a sentence"
        return sentence_ner, '', ''
    # basic sentence preprocessing
    inputsentence = tools.uniformSignal(inputstring)
    ner_lines = ''
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    new_term_list = ''
    segment_list = []
    for single_line in inputsentence.split('\n'):
        lines = tools.sentence_split(single_line)
        ner_line = ''
        term_list = ''
        segment = []
        for line in lines:
            line = line.strip()
            # # filter out markup lines that start with '<' and end with '>'
            # if line == '' or line[0] == '<' and line[-1] == '>':
            #     continue
            if line == '':
                continue
            segment.append(tools.segment(line))
            # model_2_layer
            # boundary
            feature_string = ''
            instances = []
            feature_string, tags = generateFeature.boundaryFeatureGeneration(line, [], ebao_dic, 'demo', '0')
            try:
                instances = feature_string.strip().split('\n')
            except AttributeError as e:
                print 'feature_string:%s.' % feature_string
            xseq = crfsuite.ItemSequence()
            for instance in instances:
                fields = instance.split('\t')
                item = crfsuite.Item()
                for field in fields[2:]:
                    item.append(crfsuite.Attribute(field))
                xseq.append(item)
            taggerb.set(xseq)
            yseq_b = taggerb.viterbi()
            prob_b = taggerb.probability(yseq_b)
            line_unicode = line.decode('utf-8')
            model_chosen = '2layer'
            # class
            sen_ent_list1, start_end_list1 = evaluation.generateEntList([yseq_b])
            length = len(sen_ent_list1[0])  # may be 0
            sentence = line
            entities = []
            for j in range(length):
                ent_start = sen_ent_list1[0][j][0]
                ent_end = sen_ent_list1[0][j][1]
                ent_type = sen_ent_list1[0][j][2]
                ent_content = sentence.decode('utf-8')[ent_start:ent_end].encode('utf-8')
                entities.append(Entity(ent_content, ent_start, ent_end, ent_type))
            feature_c, sen_ent4error = generateFeature.classFeatureGeneration(sentence, entities, ebao_dic, texttype)
            instances = feature_c.strip().split('\n\n')
            ents_type = []
            for instance in instances:
                xseq = crfsuite.ItemSequence()
                fields = instance.split('\t')
                item = crfsuite.Item()
                for field in fields[1:]:
                    item.append(crfsuite.Attribute(field))
                xseq.append(item)
                taggerc.set(xseq)
                yseq_c = taggerc.viterbi()
                ents_type.append(yseq_c[0])
            # rebuild a BIESO sequence from the boundary entities and their predicted classes
            new_yseq = ['O' for i in range(len(line_unicode))]
            for j in range(len(entities)):
                start = entities[j].start_pos
                end = entities[j].end_pos
                if start + 1 == end:
                    new_yseq[start] = 'S-' + ents_type[j]
                    continue
                new_yseq[start] = 'B-' + ents_type[j]
                for k in range(start + 1, end - 1):
                    new_yseq[k] = 'I-' + ents_type[j]
                new_yseq[end - 1] = 'E-' + ents_type[j]
            sen_ent_colored, ent_list = generateNerInSentence(line_unicode, new_yseq, model_chosen, ebao_dic)
            new_term_list += ent_list
            if sen_ent_colored == '':
                sen_ent_colored = line
            # ner_lines += '<p>' + sen_ent_colored + '</p>'
            # ner_lines += '<p>' + ent_list + '</p>'
            ner_line += sen_ent_colored
            term_list += ent_list
        segment_list.append(' '.join(segment))
        ner_lines += '<p>' + ner_line + '</p>'
        ner_lines += '<p>' + term_list + '</p>'
        ner_lines += '<br/>'
    segment_str = ' '.join(segment_list)
    return ner_lines, new_term_list, segment_str
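# Usage sketch (an assumption, not part of the original module): mainfunction() expects two
# already-opened crfsuite taggers, one for entity boundaries (taggerb) and one for entity
# classes (taggerc). The model file names below are placeholders.
def _example_run_demo(text):
    taggerb = crfsuite.Tagger()
    taggerb.open('boundary.model')  # hypothetical path to the trained boundary CRF model
    taggerc = crfsuite.Tagger()
    taggerc.open('class.model')     # hypothetical path to the trained entity-class CRF model
    ner_html, term_list, segment_str = mainfunction(text, taggerb, taggerc)
    return ner_html, term_list, segment_str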
def train_model(max_epochs=30,
                optimizer=adadelta,
                batch_size=256,
                ndims=100,
                nhiddens=150,
                dropout_rate=0.,
                regularization=0.,
                margin_loss_discount=0.2,
                max_word_len=4,
                start_point=1,
                load_params=None,
                resume_training=False,
                max_sent_len=60,
                beam_size=4,
                shuffle_data=True,
                train_file='../data/train',
                dev_file='../data/dev',
                lr=0.2,
                pre_training='../w2v/c_vecs_100'):
    options = locals().copy()
    print 'model options:', options
    print 'Building model'
    Cemb, character_idx_map = tools.initCemb(ndims, train_file, pre_training)
    print 'Saving config file'
    config = {}
    config['options'] = options
    config['options']['optimizer'] = optimizer.__name__
    config['character_idx_map'] = character_idx_map
    f = open('config', 'wb')
    f.write(json.dumps(config))
    f.close()
    print 'Initializing model parameters'
    params = initParams(Cemb, options)
    if load_params is not None:
        pp = np.load(load_params)
        for kk, vv in params.iteritems():
            if kk not in pp:
                raise Warning('%s is not in the archive' % kk)
            params[kk] = pp[kk]
    tparams = initTparams(params)
    if optimizer is adadelta:
        ms_up, ms_grad = prepare_adadelta(tparams)
    if optimizer is adagrad:
        if resume_training:
            ss_grad = initTparams(np.load('backup.npz'))
        else:
            ss_grad = prepare_adagrad(tparams)
    T_x, T_dropout, T_y, T_yy, T_y_mask, T_yy_mask, T_cost = build_model(tparams, options)
    # L2 regularization on the scoring matrices
    weight_decay = (tparams['U'] ** 2).sum() + (tparams['Wy'] ** 2).sum()
    weight_decay *= regularization
    T_cost += weight_decay
    if optimizer is adadelta:
        T_updates = optimizer(ms_up, ms_grad, tparams, T_cost)
    elif optimizer is sgd:
        LR, T_updates = optimizer(tparams, T_cost, lr)
    elif optimizer is adagrad:
        T_updates = optimizer(ss_grad, tparams, T_cost, lr)
    f_update = theano.function([T_x, T_dropout, T_y, T_yy, T_y_mask, T_yy_mask], T_cost, updates=T_updates)
    print 'Loading data'
    seqs, lenss, tagss = tools.prepareData(character_idx_map, train_file)
    if max_sent_len is not None:
        # drop sentences that are too long or consist of a single character
        survived = []
        for idx, seq in enumerate(seqs):
            if len(seq) <= max_sent_len and len(seq) > 1:
                survived.append(idx)
        seqs = [seqs[idx] for idx in survived]
        lenss = [lenss[idx] for idx in survived]
        tagss = [tagss[idx] for idx in survived]
    tot_lens = [len(seq) for seq in seqs]
    print 'count_training_sentences', len(seqs)
    print 'Training model'
    start_time = time.time()
    for eidx in xrange(max_epochs):
        batches_idx = get_minibatches_idx(seqs, tot_lens, batch_size, shuffle=shuffle_data)
        for batch_idx in batches_idx:
            X = [seqs[t] for t in batch_idx]
            Y = [lenss[t] for t in batch_idx]
            Z = [tagss[t] for t in batch_idx]
            X_lens = [tot_lens[t] for t in batch_idx]
            params = get_params(tparams)
            X = tools.asMatrix(X)
            dropout = np.random.binomial(1, 1 - dropout_rate, (X.shape[1], ndims)).astype(theano.config.floatX)
            # numpy_start = time.time()
            YY = tools.segment(params, options, X, X_lens, dropout, margin_loss_discount, Z)
            # print 'numpy', time.time() - numpy_start
            Y = tools.asMatrix(Y, transpose=True)
            YY = tools.asMatrix(YY, transpose=True)
            Y_mask = (Y / Y).astype(theano.config.floatX)
            YY_mask = (YY / YY).astype(theano.config.floatX)
            # theano_start = time.time()
            f_update(X, dropout, Y, YY, Y_mask, YY_mask)
            # print 'theano', time.time() - theano_start
        if optimizer is sgd:
            LR.set_value(numpy_floatX(LR.get_value() * 0.9))
        params = get_params(tparams)
        test(config['character_idx_map'], config['options'], params, dev_file,
             '../result/dev_result%s' % (eidx + start_point,))
        np.savez('epoch_%s' % (eidx + start_point,), **params)
        if optimizer is adagrad:
            np.savez('backup', **get_params(ss_grad))
        end_time = time.time()
        print 'Trained %s epoch(s), %.1fs per epoch' % (eidx + 1, (end_time - start_time) / (eidx + 1))
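# Usage sketch (an assumption, not part of the original module): train with the default
# adadelta optimizer and score the dev set after every epoch. The dropout value here is
# illustrative; the module default is 0.
def _example_train():
    train_model(max_epochs=30,
                batch_size=256,
                dropout_rate=0.2,
                train_file='../data/train',
                dev_file='../data/dev')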
def clean():
    # collect previously generated tag files for deletion; vanilla entries keep the
    # "minecraft_" prefix and are left alone
    files = []
    for folder in (".saved/tags/blocks", ".saved/tags/items",
                   ".saved/tags/entity_types", ".saved/tags/liquids"):
        files.extend([os.path.join(folder, f) for f in os.listdir(folder)
                      if not main.segment("minecraft_", 0, f)])
    files.extend([os.path.join(".saved/tags/functions", f)
                  for f in os.listdir(".saved/tags/functions")
                  if os.path.isdir(os.path.join(".saved/tags/functions", f))
                  and not main.segment("minecraft_", 0, f)])
    for f in files:
        # shutil.rmtree() only handles directories, so plain files go through os.remove()
        if os.path.isdir(f):
            shutil.rmtree(f)
        elif os.path.isfile(f):
            os.remove(f)
def genTag(file, packName, packId, useSnapshots):
    print(f'loading file "{file}"')
    name = file[:file.index(".mctag")]
    result = []
    code = []
    options = []
    with open("tags/" + file) as data:
        for i in data:
            code.append(i)
    #print("contents:")
    code = main.noComments(code)
    for i in range(0, len(code)):
        code[i] = code[i].replace(" ", "")
        #print(f"\t{i}: {code[i]}")
    t = main.words(":", code[0], [['"', '"']], False, False)[1]
    print(f'type is "{t}"')
    print(f'Loading "{t}.csv" into memory')
    with open(f".saved/data/{t}.csv", "r") as csvFile:
        dictReader = csv.DictReader(csvFile)
        for i in dictReader:
            options.append(i)
    print(f'got {len(options)} entries from "{t}.csv"')

    def getOption(options, x):
        split = x.split(":")
        for i in options:
            if i["namespace"] == split[0] and i["name"] == split[1]:
                return i
        return None

    print("filtering entries")
    for line in code[1:]:
        line = line.strip()
        if line[0] == "+" or line[0] == "-":
            workingString = line[1:].strip()
            workingList = []
            if workingString == "all":
                for i in options:
                    workingList.append(i["namespace"] + ":" + i["name"])
            elif main.segment("all", 0, workingString):
                argString = main.groups(workingString, [["(", ")"]], False)[0]
                if argString[0] == "#":
                    # reference to another tag
                    if os.path.exists(f"tags/{argString[1:]}.mctag"):
                        if not (f"{argString[1:]}.mctag" in done):
                            with open(f"tags/{argString[1:]}.mctag", "r") as data:
                                print(f'file "{argString[1:]}.mctag" must be loaded before continuing.')
                                workingList.extend(genTag(f"{argString[1:]}.mctag", packName, packId, useSnapshots))
                                print(f'continuing to load "{file}"')
                        else:
                            def getEntries(path):
                                result = []
                                with open(f".saved/tags/{t}/{path}.txt", "r") as data:
                                    for i in data:
                                        for i2 in i.split(","):
                                            i2 = i2.strip()
                                            if i2[0] == "#":
                                                if not ":" in i2:
                                                    workingList.extend(getEntries(f"minecraft_{i2[1:]}"))
                                                else:
                                                    workingList.extend(getEntries(i2[1:]))
                                            else:
                                                if not ":" in i2:
                                                    result.append("minecraft_" + i2)
                                                else:
                                                    result.append(i2.replace(":", "_"))
                                return result
                            workingList.extend(getEntries(argString[1:].replace(":", "_")))
                    elif os.path.exists(f".saved/tags/{t}/{argString[1:].replace(':', '_')}.txt"):
                        def getEntries(path):
                            result = []
                            with open(f".saved/tags/{t}/{path}.txt", "r") as data:
                                for i in data:
                                    for i2 in i.split(","):
                                        i2 = i2.strip()
                                        if i2[0] == "#":
                                            if not ":" in i2:
                                                workingList.extend(getEntries(f"minecraft_{i2[1:]}"))
                                            else:
                                                workingList.extend(getEntries(i2[1:].replace(":", "_")))
                                        else:
                                            if not ":" in i2:
                                                result.append("minecraft_" + i2)
                                            else:
                                                result.append(i2.replace(":", "_"))
                            return result
                        workingList.extend(getEntries(argString[1:].replace(":", "_")))
                    else:
                        # The tag isn't defined here. Append it to the pack anyway in case it's defined somewhere else.
                        workingList.append(argString)
                elif "=" in argString or "<" in argString or ">" in argString:
                    args = main.words(",", argString, [['"', '"']], False, False)
                    pars = {}
                    li = []
                    opCount = 0
                    for arg in args:
                        match = re.match(r"^(?P<key>.+)(?P<operation>\>=|\<=|!=|==|\>|\<)(?P<value>.+)$", arg)
                        if not match == None:
                            opCount += 1
                            operation = match.group("operation")
                            key = match.group("key")
                            value = match.group("value")
                            # filter options on the CSV column; guard removals so a missing entry doesn't raise
                            if operation in ("==", "!="):
                                wanted = (operation == "==")
                                for i in options:
                                    entry = i["namespace"] + ":" + i["name"]
                                    if (i[key] == value.lower()) == wanted:
                                        li.append(entry)
                                    elif entry in li:
                                        li.remove(entry)
                            else:
                                value = numberCast(value)
                                compare = {">": lambda a, b: a > b, "<": lambda a, b: a < b,
                                           ">=": lambda a, b: a >= b, "<=": lambda a, b: a <= b}[operation]
                                for i in options:
                                    entry = i["namespace"] + ":" + i["name"]
                                    if compare(numberCast(i[key]), value):
                                        li.append(entry)
                                    elif entry in li:
                                        li.remove(entry)
                        elif "=" in arg:
                            par = main.words("=", arg, [['"', '"']], False, False)
                            if not par[0] in pars:
                                pars[par[0]] = []
                            pars[par[0]].append(par[1])
                    if opCount == 0:
                        for i in options:
                            li.append(i["namespace"] + ":" + i["name"])
                    if "sort" in pars:
                        if pars["sort"][-1] == "alphabetical":
                            li = sorted(li)
                        else:
                            def value(li1):
                                def inner(x):
                                    split = x.split(":")
                                    for i in li1:
                                        if i["namespace"] == split[0] and i["name"] == split[1]:
                                            num = numberCast(i[pars["sort"][-1]])
                                            if not num == -math.inf:
                                                return (1, num)
                                            else:
                                                return (2, i[pars["sort"][-1]])
                                    return (0, x)
                                return inner
                            li = sorted(li, key=value(options))
                    if "reverse" in pars:
                        if pars["reverse"][-1].lower() == "true":
                            li.reverse()
                    if "limit" in pars:
                        li = li[:min(len(li), int(numberCast(pars["limit"][-1])))]
                    if "in" in pars:
                        for seg in pars["in"]:
                            for i in list(li):  # iterate over a copy so removal is safe
                                if not seg in i:
                                    li.remove(i)
                    if "notin" in pars:
                        for seg in pars["notin"]:
                            for i in list(li):
                                if seg in i:
                                    li.remove(i)
                    for i in li:
                        workingList.append(i)
                else:
                    # plain substring match on the entry name; a leading '!' negates the match
                    reverse = False
                    if argString[0] == "!":
                        argString = argString[1:]
                        reverse = True
                    for i in options:
                        if argString in i["name"] and not reverse:
                            workingList.append(i["namespace"] + ":" + i["name"])
                        elif reverse and not argString in i["name"]:
                            workingList.append(i["namespace"] + ":" + i["name"])
            elif ":" in workingString:
                workingList.append(workingString)
            else:
                workingList.append("minecraft:" + workingString)
            if line[0] == "+":
                for i in workingList:
                    if not i.strip() in result:
                        result.append(i.strip())
            elif line[0] == "-":
                for i in workingList:
                    element = i.strip()
                    if element in result:
                        result.remove(element)
        elif line == "reverse":
            result.reverse()
        elif main.segment("sort", 0, line):
            argString = main.groups(line, [["(", ")"]], False)[0]
            if argString == "alphabetical":
                result = sorted(result)
            else:
                def value(li1):
                    def inner(x):
                        split = x.split(":")
                        for i in li1:
                            if i["namespace"] == split[0] and i["name"] == split[1]:
                                num = numberCast(i[argString])
                                if not num == -math.inf:
                                    return (1, num)
                                else:
                                    return (2, i[argString])
                        return (0, x)
                    return inner
                result = sorted(result, key=value(options))
        elif main.segment("limit", 0, line):
            argString = main.groups(line, [["(", ")"]], False)[0]
            result = result[:min(len(result), int(numberCast(argString)))]
    if not useSnapshots:
        for entry in list(result):  # iterate over a copy so removal is safe
            option = getOption(options, entry)
            if option != None and "snapshot" in option and option["snapshot"].lower() == "true":
                result.remove(entry)
    name_split = re.split(r"(/|\\)", name)
    if len(name_split) > 1:
        os.makedirs(f".generated/packs/{packName}/data/{packId}/tags/{t}/{'/'.join(name_split[:len(name_split)-1])}", exist_ok=True)
    with open(f".generated/packs/{packName}/data/{packId}/tags/{t}/{name}.json", "w+") as file1:
        json.dump({"replace": False, "values": result}, file1, indent=4)
    if len(name_split) > 1:
        os.makedirs(f".saved/tags/{t}/{'/'.join(name_split[:len(name_split)-1])}", exist_ok=True)
        #print(f".saved/tags/{t}/{'/'.join(name_split[:len(name_split)-1])}")
    with open(f".saved/tags/{t}/{name}.txt", "w+") as data:
        data.write("\n".join(result))
    print(f'done loading "{file}"')
    print(f'deleting "{t}.csv" from memory to save space')
    del options
    done.append(file)
    return result