Example #1
def test(character_idx_map, options, params, path, filename, batch_size=512):

    X = tools.prepareData(character_idx_map, path, test=True)
    dropout = (1 - options['dropout_rate']) * np.ones(
        (options['ndims'], ), dtype=theano.config.floatX)
    start, n = 0, len(X)
    idx_list = range(n)
    lens = [len(x) for x in X]
    idx_list = sorted(idx_list, cmp=lambda x, y: cmp(lens[x], lens[y]))
    Y = []
    print 'count_test_sentences', len(X)

    for i in range(n // batch_size):
        batch_idx = idx_list[start:start + batch_size]
        x = [X[t] for t in batch_idx]
        x_lens = [lens[t] for t in batch_idx]
        x = tools.asMatrix(x)
        sY = tools.segment(params, options, x, x_lens, dropout)
        Y.extend(sY)
        start += batch_size
    if start != n:
        batch_idx = idx_list[start:]
        x = [X[t] for t in batch_idx]
        x_lens = [lens[t] for t in batch_idx]
        x = tools.asMatrix(x)
        sY = tools.segment(params, options, x, x_lens, dropout)
        Y.extend(sY)
    table = {}
    nb = 0
    for idx in idx_list:
        table[idx] = nb
        nb += 1
    output_result(Y, table, path, filename)
Example #2
File: test.py Project: jcyk/CWS
def test(character_idx_map,
         options,
         params,
         path,
         filename,
         batch_size = 512
         ):
    
    X = tools.prepareData(character_idx_map,path,test=True)
    dropout = (1-options['dropout_rate'])*np.ones((options['ndims'],), dtype=theano.config.floatX)
    start,n = 0,len(X)
    idx_list = range(n)
    lens = [len(x) for x in X]
    idx_list = sorted(idx_list,cmp = lambda x,y: cmp(lens[x],lens[y]))
    Y = []
    print 'count_test_sentences',len(X)
    
    for i in range(n//batch_size):
        batch_idx = idx_list[start:start+batch_size]
        x = [X[t] for t in batch_idx]
        x_lens = [lens[t] for t in batch_idx]
        x = tools.asMatrix(x)
        sY = tools.segment(params,options,x,x_lens,dropout)
        Y.extend(sY)
        start+=batch_size
    if start!=n:
        batch_idx = idx_list[start:]
        x = [X[t] for t in batch_idx]
        x_lens = [lens[t] for t in batch_idx]
        x = tools.asMatrix(x)
        sY = tools.segment(params,options,x,x_lens,dropout)
        Y.extend(sY)
    table = {}
    nb= 0
    for idx in idx_list:
        table[idx] = nb
        nb+=1
    output_result(Y,table,path,filename)
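Both versions of test above use the same ordering trick: sentence indices are sorted by length so each padded mini-batch wastes as little space as possible, and the table dict later lets output_result map each prediction back to its original sentence. A minimal, self-contained sketch of just that bookkeeping (plain Python with hypothetical toy data; the real tools.segment call is replaced by a dummy length lookup):

def batch_by_length(sentences, batch_size=3):
    # Sort sentence indices by length so padding inside each batch stays small.
    idx_list = sorted(range(len(sentences)), key=lambda i: len(sentences[i]))
    outputs = []
    for start in range(0, len(idx_list), batch_size):
        batch_idx = idx_list[start:start + batch_size]
        batch = [sentences[i] for i in batch_idx]
        # Dummy stand-in for tools.segment: the "result" is just the token count.
        outputs.extend(len(s) for s in batch)
    # table[original_index] -> position of that sentence's result in outputs
    table = {idx: pos for pos, idx in enumerate(idx_list)}
    return [outputs[table[i]] for i in range(len(sentences))]

print(batch_by_length([['a', 'b', 'c'], ['a'], ['a', 'b'], ['a', 'b', 'c', 'd']]))
# -> [3, 1, 2, 4]: results restored to the original sentence order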
Example #3
def write2traindata(filename, modelfolder, tempfolderpath, output):
    in_file = filename + '.xml'
    sen_file = in_file + '.sen'
    ent_file = in_file + '.ent'
    tools.split_sentence(in_file, sen_file)
    dic_sen_ent = tools.get_sentence_entity(sen_file, ent_file)

    seg_model = os.path.join(modelfolder, 'segmenter.model')
    pos_model = os.path.join(modelfolder, 'postagger.model')

    for k, v in dic_sen_ent.items():
        if len(v) == 0:
            continue

        entities = sorted(v, key=lambda ent: ent.start_pos)
        sentence, title = k.split('###')[:2]
        sentence_input = sentence.replace('\t', ' ')
        sentence_input = sentence_input.replace('\r\n', '')
        sentence_uniform = tools.uniform(sentence_input)

        sentence_seg = tools.segment(sentence_uniform, seg_model,
                                     tempfolderpath)
        seg_tags = tools.getSegmentTags(sentence_uniform, sentence_seg)

        words, postags = tools.pos(sentence_seg, pos_model, tempfolderpath)
        pos_tags = tools.getPosTags(sentence_uniform, words, postags)

        ent_tags = tools.getEntityTags(sentence_uniform, entities)

        #character, seg_tag, pos_tag, ent_tag
        sentence_unicode = sentence_uniform.decode('utf8')
        j = 0
        for i in range(len(sentence_unicode)):
            if sentence_unicode[i] == ' ':
                continue
            else:
                output.write('%s\t%s\t%s\t%s\n' %
                             (sentence_unicode[i], seg_tags[j],
                              seg_tags[j] + '-' + pos_tags[j], ent_tags[i]))
                j += 1
        output.write('\n')
Example #4
def boundaryFeatureGeneration(sentence, entities_in_sen, ebao_dic, label_type,
                              tag_strategy):
    sentence_seg = tools.segment(sentence)  # the input sentence contains no spaces

    if len(entities_in_sen) > 0:  # annotated training files may contain sentences without any entities
        entity_position_list, sentence_reseg = get_entity_pos(
            entities_in_sen, sentence, sentence_seg)
    else:
        sentence_reseg = sentence_seg

    # word list and POS-tag list
    word_list, postag_list = tools.pos(sentence_reseg)
    # per-character segmentation and POS tags
    char_seg_tag_list, char_pos_tag_list = getCharSegPosTag(
        word_list, postag_list)  # no spaces

    # per-character entity tags and dictionary-match boundary tags
    if label_type == 'demo':
        sen_no_use, ents_matched = tools.matchEntityCombine(sentence, ebao_dic)
        ents_matched = string2Entity(ents_matched)
        dic_match_tag_list = getCharEntityFullTag(sentence.decode('utf-8'),
                                                  ents_matched,
                                                  ebao_dic)  # with entity types
        char_entity_tag_list = [[sentence.decode('utf-8')[i], ['O']]
                                for i in range(len(sentence.decode('utf-8')))]
    else:
        if label_type == 'full':
            char_entity_tag_list = getCharEntityFullTag(
                sentence.decode('utf-8'), entities_in_sen, ebao_dic)
            sen_no_use, ents_matched = tools.matchEntityCombine(
                sentence, ebao_dic)
            ents_matched = string2Entity(ents_matched)
            dic_match_tag_list = getCharEntityFullTag(sentence.decode('utf-8'),
                                                      ents_matched,
                                                      ebao_dic)  # with entity types
        if label_type == 'partial':
            char_entity_tag_list = getCharEntityFPTag(sentence.decode('utf-8'),
                                                      entities_in_sen,
                                                      tag_strategy)
            dic_match_tag_list = getCharEntityFPTag(sentence.decode('utf-8'),
                                                    entities_in_sen,
                                                    '1')  # without entity types
            char_entity_tag_list = getCharEntityPartialTag(
                char_entity_tag_list, tag_strategy)

    # build the list of (character, segmentation tag, POS tag) triples
    char_seg_pos_list = []
    for i in range(len(char_seg_tag_list)):
        char_seg_pos_list.append(
            (char_seg_tag_list[i][0], char_seg_tag_list[i][1],
             char_pos_tag_list[i][1]))
    bos = '__BOS__'
    eos = '__EOS__'
    new_csp_list = [(bos, bos, bos),
                    (bos, bos, bos)] + char_seg_pos_list + [(eos, eos, eos),
                                                            (eos, eos, eos)]
    length = len(new_csp_list)
    features = ''
    tags_in_sentence = []
    for i in range(2, length - 2):
        feature_vec = []

        # character features
        feature_vec += [
            new_csp_list[i - 2][0], new_csp_list[i - 1][0], new_csp_list[i][0],
            new_csp_list[i + 1][0], new_csp_list[i + 2][0],
            new_csp_list[i - 2][0] + '/' + new_csp_list[i - 1][0],
            new_csp_list[i - 1][0] + '/' + new_csp_list[i][0],
            new_csp_list[i][0] + '/' + new_csp_list[i + 1][0],
            new_csp_list[i + 1][0] + '/' + new_csp_list[i + 2][0]
        ]
        # segmentation-tag features
        feature_vec += [
            new_csp_list[i - 2][1], new_csp_list[i - 1][1], new_csp_list[i][1],
            new_csp_list[i + 1][1], new_csp_list[i + 2][1],
            new_csp_list[i - 2][1] + '/' + new_csp_list[i - 1][1],
            new_csp_list[i - 1][1] + '/' + new_csp_list[i][1],
            new_csp_list[i][1] + '/' + new_csp_list[i + 1][1],
            new_csp_list[i + 1][1] + '/' + new_csp_list[i + 2][1]
        ]
        # POS-tag features
        feature_vec += [
            new_csp_list[i - 2][2], new_csp_list[i - 1][2], new_csp_list[i][2],
            new_csp_list[i + 1][2], new_csp_list[i + 2][2],
            new_csp_list[i - 2][2] + '/' + new_csp_list[i - 1][2],
            new_csp_list[i - 1][2] + '/' + new_csp_list[i][2],
            new_csp_list[i][2] + '/' + new_csp_list[i + 1][2],
            new_csp_list[i + 1][2] + '/' + new_csp_list[i + 2][2]
        ]
        # character-type features
        feature_vec += [
            isSpecial(new_csp_list[i - 1][0]),
            isSpecial(new_csp_list[i][0]),
            isSpecial(new_csp_list[i + 1][0]),
            isSpecial(new_csp_list[i - 1][0]) + '/' +
            isSpecial(new_csp_list[i][0]),
            isSpecial(new_csp_list[i][0]) + '/' +
            isSpecial(new_csp_list[i + 1][0])
        ]

        # dictionary features
        dic_boundary_tag = dic_match_tag_list[i - 2][1][0]
        dic_b_tag = dic_boundary_tag[:1]
        feature_vec += [dic_b_tag]

        # entity tag
        try:
            entity_tag = char_entity_tag_list[i - 2][1][0]
            if '|' in entity_tag and len(re.findall(
                    '-', entity_tag)) > 1:  # for partial tags
                # if '|' in entity_tag:
                features += entity_tag
            else:
                if '-' in entity_tag:  # for training data that contains entity types
                    parts = entity_tag.split('-')
                    if '|' in parts[1]:
                        entity_tag = parts[0] + '-entity'  # the test set carries no partial tags, so nothing more is done here
                    ent_tag = parts[0] + '-entity'
                    features += ent_tag
                else:  # O
                    features += entity_tag
        except IndexError as e:
            print sentence
            print i
            print new_csp_list
            print char_entity_tag_list
            return None

        # for strategy 4, demo
        if label_type == 'demo':
            if '-' in dic_boundary_tag:
                features += '\t' + dic_boundary_tag[0] + '-entity'
            else:
                features += '\t' + dic_boundary_tag

        for j in range(len(feature_vec)):
            features += '\tf' + str(j) + '=' + str(feature_vec[j])
        features += '\n'
        tags_in_sentence.append(entity_tag)
    features += '\n'
    return features, tags_in_sentence
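Most of boundaryFeatureGeneration is a fixed CRF feature template: the character sequence is padded with two __BOS__ / __EOS__ sentinels and, at every position, unigram and bigram features are emitted over a window of two characters on each side. A stripped-down sketch of just that windowing step, assuming a plain string input and ignoring the segmentation, POS, dictionary and entity columns:

def window_features(chars, bos='__BOS__', eos='__EOS__'):
    padded = [bos, bos] + list(chars) + [eos, eos]
    rows = []
    for i in range(2, len(padded) - 2):
        window = padded[i - 2:i + 3]
        # Five unigrams over the +/-2 window, then the four adjacent bigrams,
        # mirroring the nine character features built above.
        feats = list(window) + [a + '/' + b for a, b in zip(window, window[1:])]
        rows.append('\t'.join('f%d=%s' % (j, f) for j, f in enumerate(feats)))
    return '\n'.join(rows)

print(window_features('ABC'))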
Example #5
def mainfunction(inputstring, taggerb, taggerc):
    if inputstring == '':
        sentence_ner = '请输入句子'  # "Please enter a sentence"
        return sentence_ner, '', ''

    # basic sentence preprocessing
    inputsentence = tools.uniformSignal(inputstring)

    ner_lines = ''
    bieso = ['B-entity', 'I-entity', 'E-entity', 'S-entity', 'O']
    new_term_list = ''

    segment_list = []
    for single_line in inputsentence.split('\n'):
        lines = tools.sentence_split(single_line)
        ner_line = ''
        term_list = ''
        segment = []
        for line in lines:
            line = line.strip()
            # # filter out markup: skip lines starting with '<' and ending with '>'
            # if line == '' or line[0] == '<' and line[-1] == '>' : continue
            if line == '': continue

            segment.append(tools.segment(line))
            # model_2_layer
            # boundary
            feature_string = ''
            instances = []
            feature_string, tags = generateFeature.boundaryFeatureGeneration(
                line, [], ebao_dic, 'demo', '0')
            try:
                instances = feature_string.strip().split('\n')
            except AttributeError as e:
                print 'feature_string:%s.' % feature_string
            xseq = crfsuite.ItemSequence()
            for instance in instances:
                fields = instance.split('\t')
                item = crfsuite.Item()
                for field in fields[2:]:
                    item.append(crfsuite.Attribute(field))
                xseq.append(item)
            taggerb.set(xseq)

            yseq_b = taggerb.viterbi()
            prob_b = taggerb.probability(yseq_b)
            line_unicode = line.decode('utf-8')

            model_chosen = '2layer'
            # class
            sen_ent_list1, start_end_list1 = evaluation.generateEntList(
                [yseq_b])

            length = len(sen_ent_list1[0])
            # handles the case where length is 0 (no entities)
            sentence = line
            entities = []
            for j in range(length):
                ent_start = sen_ent_list1[0][j][0]
                ent_end = sen_ent_list1[0][j][1]
                ent_type = sen_ent_list1[0][j][2]
                ent_content = sentence.decode(
                    'utf-8')[ent_start:ent_end].encode('utf-8')
                entities.append(
                    Entity(ent_content, ent_start, ent_end, ent_type))
            feature_c, sen_ent4error = generateFeature.classFeatureGeneration(
                sentence, entities, ebao_dic, texttype)
            instances = feature_c.strip().split('\n\n')
            ents_type = []
            for instance in instances:
                xseq = crfsuite.ItemSequence()
                fields = instance.split('\t')
                item = crfsuite.Item()
                for field in fields[1:]:
                    item.append(crfsuite.Attribute(field))
                xseq.append(item)
                taggerc.set(xseq)
                yseq_c = taggerc.viterbi()
                ents_type.append(yseq_c[0])
            new_yseq = ['O' for i in range(len(line_unicode))]
            for j in range(len(entities)):
                start = entities[j].start_pos
                end = entities[j].end_pos
                if start + 1 == end:
                    new_yseq[start] = 'S-' + ents_type[j]
                    continue
                new_yseq[start] = 'B-' + ents_type[j]
                for k in range(start + 1, end - 1):
                    new_yseq[k] = 'I-' + ents_type[j]
                new_yseq[end - 1] = 'E-' + ents_type[j]

            sen_ent_colored, ent_list = generateNerInSentence(
                line_unicode, new_yseq, model_chosen, ebao_dic)
            new_term_list += ent_list
            if sen_ent_colored == '': sen_ent_colored = line
            # ner_lines += '<p>' + sen_ent_colored + '</p>'
            # ner_lines += '<p>' + ent_list + '</p>'
            ner_line += sen_ent_colored
            term_list += ent_list

        segment_list.append(' '.join(segment))
        ner_lines += '<p>' + ner_line + '</p>'
        ner_lines += '<p>' + term_list + '</p>'
        ner_lines += '<br/>'

    segment_str = ' '.join(segment_list)
    return ner_lines, new_term_list, segment_str
Example #6
def train_model(max_epoches=30,
                optimizer=adadelta,
                batch_size=256,
                ndims=100,
                nhiddens=150,
                dropout_rate=0.,
                regularization=0.,
                margin_loss_discount=0.2,
                max_word_len=4,
                start_point=1,
                load_params=None,
                resume_training=False,
                max_sent_len=60,
                beam_size=4,
                shuffle_data=True,
                train_file='../data/train',
                dev_file='../data/dev',
                lr=0.2,
                pre_training='../w2v/c_vecs_100'):
    options = locals().copy()
    print 'model options:', options
    print 'Building model'

    Cemb, character_idx_map = tools.initCemb(ndims, train_file, pre_training)

    print '%saving config file'
    config = {}
    config['options'] = options
    config['options']['optimizer'] = optimizer.__name__
    config['character_idx_map'] = character_idx_map
    f = open('config', 'wb')
    f.write(json.dumps(config))
    f.close()
    print '%resume model building'

    params = initParams(Cemb, options)
    if load_params is not None:
        pp = np.load(load_params)
        for kk, vv in params.iteritems():
            if kk not in pp:
                raise Warning('%s is not in the archive' % kk)
            params[kk] = pp[kk]
    tparams = initTparams(params)
    if optimizer is adadelta:
        ms_up, ms_grad = prepare_adadelta(tparams)
    if optimizer is adagrad:
        if resume_training:
            ss_grad = initTparams(np.load('backup.npz'))
        else:
            ss_grad = prepare_adagrad(tparams)
    T_x, T_dropout, T_y, T_yy, T_y_mask, T_yy_mask, T_cost = build_model(
        tparams, options)
    weight_decay = (tparams['U']**2).sum() + (tparams['Wy']**2).sum()
    weight_decay *= regularization
    T_cost += weight_decay

    if optimizer is adadelta:
        T_updates = optimizer(ms_up, ms_grad, tparams, T_cost)
    elif optimizer is sgd:
        LR, T_updates = optimizer(tparams, T_cost, lr)
    elif optimizer is adagrad:
        T_updates = optimizer(ss_grad, tparams, T_cost, lr)

    f_update = theano.function(
        [T_x, T_dropout, T_y, T_yy, T_y_mask, T_yy_mask],
        T_cost,
        updates=T_updates)

    print 'Loading data'
    seqs, lenss, tagss = tools.prepareData(character_idx_map, train_file)
    if max_sent_len is not None:
        survived = []
        for idx, seq in enumerate(seqs):
            if len(seq) <= max_sent_len and len(seq) > 1:
                survived.append(idx)
        seqs = [seqs[idx] for idx in survived]
        lenss = [lenss[idx] for idx in survived]
        tagss = [tagss[idx] for idx in survived]

    tot_lens = [len(seq) for seq in seqs]
    print 'count_training_sentences', len(seqs)

    print 'Training model'
    start_time = time.time()
    for eidx in xrange(max_epoches):
        batches_idx = get_minibatches_idx(seqs,
                                          tot_lens,
                                          batch_size,
                                          shuffle=shuffle_data)
        for batch_idx in batches_idx:
            X = [seqs[t] for t in batch_idx]
            Y = [lenss[t] for t in batch_idx]
            Z = [tagss[t] for t in batch_idx]
            X_lens = [tot_lens[t] for t in batch_idx]
            params = get_params(tparams)
            X = tools.asMatrix(X)
            dropout = np.random.binomial(1, 1 - dropout_rate,
                                         (X.shape[1], ndims)).astype(
                                             theano.config.floatX)
            #numpy_start = time.time()
            YY = tools.segment(params, options, X, X_lens, dropout,
                               margin_loss_discount, Z)
            #print 'numpy',time.time()-numpy_start
            Y = tools.asMatrix(Y, transpose=True)
            YY = tools.asMatrix(YY, transpose=True)
            Y_mask = (Y / Y).astype(theano.config.floatX)
            YY_mask = (YY / YY).astype(theano.config.floatX)
            #theano_start = time.time()
            f_update(X, dropout, Y, YY, Y_mask, YY_mask)
            #print 'theano',time.time()-theano_start
        if optimizer is sgd:
            LR.set_value(numpy_floatX(LR.get_value() * 0.9))
        params = get_params(tparams)
        test(config['character_idx_map'], config['options'], params, dev_file,
             '../result/dev_result%s' % (eidx + start_point, ))
        np.savez('epoch_%s' % (eidx + start_point, ), **params)
        if optimizer is adagrad:
            np.savez('backup', **get_params(ss_grad))
        end_time = time.time()
        print 'Trained %s epoch(s) took %.lfs per epoch' % (
            eidx + 1, (end_time - start_time) / (eidx + 1))
Example #7
File: model.py Project: jcyk/CWS
def train_model(
    max_epochs = 30,
    optimizer = adadelta,
    batch_size = 256,
    ndims = 100,
    nhiddens = 150,
    dropout_rate = 0.,
    regularization = 0.,
    margin_loss_discount = 0.2,
    max_word_len = 4,
    start_point = 1,
    load_params = None,
    resume_training = False,
    max_sent_len = 60,
    beam_size = 4,
    shuffle_data = True,
    train_file = '../data/train',
    dev_file = '../data/dev',
    lr = 0.2,
    pre_training = '../w2v/c_vecs_100'
):
    options = locals().copy()
    print 'model options:',options
    print 'Building model'
    
    Cemb,character_idx_map = tools.initCemb(ndims,train_file,pre_training)
    
    print '%saving config file'
    config = {}
    config['options'] = options
    config['options']['optimizer'] = optimizer.__name__
    config['character_idx_map'] = character_idx_map
    f = open('config','wb')
    f.write(json.dumps(config))
    f.close()
    print '%resume model building'
    
    params = initParams(Cemb,options)
    if load_params is not None:
        pp = np.load(load_params)
        for kk,vv in params.iteritems():
            if kk not in pp:
                raise Warning('%s is not in the archive' % kk)
            params[kk] = pp[kk]
    tparams = initTparams(params)
    if optimizer is adadelta:
        ms_up,ms_grad = prepare_adadelta(tparams)
    if optimizer is adagrad:
        if resume_training:
            ss_grad = initTparams(np.load('backup.npz'))
        else:
            ss_grad = prepare_adagrad(tparams)
    T_x,T_dropout,T_y,T_yy,T_y_mask,T_yy_mask,T_cost = build_model(tparams,options)
    weight_decay = (tparams['U']**2).sum()+(tparams['Wy']**2).sum()
    weight_decay *= regularization
    T_cost += weight_decay

    if optimizer is adadelta:
        T_updates = optimizer(ms_up,ms_grad,tparams,T_cost)
    elif optimizer is sgd:
        LR,T_updates = optimizer(tparams,T_cost,lr)
    elif optimizer is adagrad:
        T_updates = optimizer(ss_grad,tparams,T_cost,lr)

    f_update = theano.function([T_x,T_dropout,T_y,T_yy,T_y_mask,T_yy_mask],T_cost,updates=T_updates)

    print 'Loading data'
    seqs,lenss,tagss = tools.prepareData(character_idx_map,train_file)
    if max_sent_len is not None:
        survived = []
        for idx,seq in enumerate(seqs):
            if len(seq)<=max_sent_len and len(seq)>1:
                survived.append(idx)
        seqs =  [ seqs[idx]  for idx in survived]
        lenss = [ lenss[idx] for idx in survived]
        tagss = [ tagss[idx] for idx in survived]

    tot_lens = [len(seq) for seq in seqs]
    print 'count_training_sentences',len(seqs)
    
    print 'Training model'
    start_time = time.time()
    for eidx in xrange(max_epochs):
        batches_idx = get_minibatches_idx(seqs,tot_lens,batch_size,shuffle=shuffle_data)
        for batch_idx in batches_idx:
            X = [seqs[t]  for t in batch_idx]
            Y = [lenss[t] for t in batch_idx]
            Z = [tagss[t] for t in batch_idx]
            X_lens = [tot_lens[t] for t in batch_idx]
            params = get_params(tparams)
            X = tools.asMatrix(X)
            dropout = np.random.binomial(1,1-dropout_rate,(X.shape[1],ndims)).astype(theano.config.floatX)
            #numpy_start = time.time()
            YY= tools.segment(params,options,X,X_lens,dropout,margin_loss_discount,Z)
            #print 'numpy',time.time()-numpy_start
            Y = tools.asMatrix(Y,transpose=True)
            YY = tools.asMatrix(YY,transpose=True)
            Y_mask = (Y/Y).astype(theano.config.floatX)
            YY_mask =(YY/YY).astype(theano.config.floatX)
            #theano_start = time.time()
            f_update(X,dropout,Y,YY,Y_mask,YY_mask)
            #print 'theano',time.time()-theano_start
        if optimizer is sgd:
            LR.set_value(numpy_floatX(LR.get_value()*0.9))
        params = get_params(tparams)
        test(config['character_idx_map'],config['options'],params,dev_file,'../result/dev_result%s'%(eidx+start_point,))
        np.savez('epoch_%s'%(eidx+start_point,),**params)
        if optimizer is adagrad:
            np.savez('backup',**get_params(ss_grad))
        end_time = time.time()
        print 'Trained %s epoch(s) took %.lfs per epoch'%(eidx+1,(end_time-start_time)/(eidx+1))
Example #8
def clean():
  files = []
  files.extend([os.path.join(".saved/tags/blocks", f) for f in os.listdir(".saved/tags/blocks") if ((os.path.isdir(os.path.join(".saved/tags/blocks", f)) and not main.segment("minecraft_", 0, f))) or not main.segment("minecraft_", 0, f)])
  files.extend([os.path.join(".saved/tags/items", f) for f in os.listdir(".saved/tags/items") if ((os.path.isdir(os.path.join(".saved/tags/items", f)) and not main.segment("minecraft_", 0, f)) or not main.segment("minecraft_", 0, f))])
  files.extend([os.path.join(".saved/tags/entity_types", f) for f in os.listdir(".saved/tags/entity_types") if ((os.path.isdir(os.path.join(".saved/tags/entity_types", f)) and not main.segment("minecraft_", 0, f)) or not main.segment("minecraft_", 0, f))])
  files.extend([os.path.join(".saved/tags/liquids", f) for f in os.listdir(".saved/tags/liquids") if ((os.path.isdir(os.path.join(".saved/tags/liquids", f)) and not main.segment("minecraft_", 0, f))) or not main.segment("minecraft_", 0, f)])
  files.extend([os.path.join(".saved/tags/functions", f) for f in os.listdir(".saved/tags/functions") if ((os.path.isdir(os.path.join(".saved/tags/functions", f)) and not main.segment("minecraft_", 0, f)))])

  for f in files:
    # shutil.rmtree only handles directories; plain files need os.remove.
    if os.path.isdir(f):
      shutil.rmtree(f)
    elif os.path.isfile(f):
      os.remove(f)
def genTag(file, packName, packId, useSnapshots):
  print(f'loading file "{file}"')
  name = file[:file.index(".mctag")]
  result = []
  code = []
  options = []

  with open("tags/" + file) as data:
    for i in data:
      code.append(i)
  
  #print("contents:")
  code = main.noComments(code)
  for i in range(0,len(code)):
    code[i] = code[i].replace(" ", "")
    #print(f"\t{i}: {code[i]}")

  t = main.words(":", code[0], [['"','"']], False, False)[1]
  print(f'type is "{t}"')
  print(f'Loading "{t}.csv" into memory')

  with open(f".saved/data/{t}.csv", "r") as csvFile:
    dictReader = csv.DictReader(csvFile)
    for i in dictReader:
      options.append(i)

  print(f'got {len(options)} entries from "{t}.csv"')

  def getOption(options,x):
    split = x.split(":")
    for i in options:
      if i["namespace"] == split[0] and i["name"] == split[1]:
        return i
    return None

  print("filtering entries")
  for line in code[1:]:
    line = line.strip()
    if line[0] == "+" or line[0] == "-":
      workingString = line[1:].strip()
      workingList = []
      if workingString == "all":
        for i in options:
          workingList.append(i["namespace"] + ":" + i["name"])
      elif main.segment("all", 0, workingString):
        argString = main.groups(workingString, [["(",")"]], False)[0]
        if argString[0] == "#":
          if os.path.exists(f"tags/{argString[1:]}.mctag"):
            if not (f"{argString[1:]}.mctag" in done):
              with open(f"tags/{argString[1:]}.mctag", "r") as data:
                print(f'file "{argString[1:]}.mctag" must be loaded before continuing.')
                workingList.extend(genTag(f"{argString[1:]}.mctag", packName, packId, useSnapshots))
                print(f'continuing to load "{file}"')
            else:
              def getEntries(path):
                result = []
                with open(f".saved/tags/{t}/{path}.txt", "r") as data:
                    for i in data:
                      for i2 in i.split(","):
                        i2 = i2.strip()
                        if i2[0] == "#":
                          if not ":" in i2:
                            workingList.extend(getEntries(f"minecraft_{i2[1:]}"))
                          else:
                            workingList.extend(getEntries(i2[1:]))
                        else:
                          if not ":" in i2:
                            result.append("minecraft_" + i2)
                          else:
                            result.append(i2.replace(":", "_"))

                return result

              workingList.extend(getEntries(argString[1:].replace(":", "_")))
          elif os.path.exists(f".saved/tags/{t}/{argString[1:].replace(':', '_')}.txt"):
            def getEntries(path):
              result = []
              with open(f".saved/tags/{t}/{path}.txt", "r") as data:
                  for i in data:
                    for i2 in i.split(","):
                      i2 = i2.strip()
                      if i2[0] == "#":
                        if not ":" in i2:
                          workingList.extend(getEntries(f"minecraft_{i2[1:]}"))
                        else:
                          workingList.extend(getEntries(i2[1:].replace(":", "_")))
                      else:
                        if not ":" in i2:
                          result.append("minecraft_" + i2)
                        else:
                          result.append(i2.replace(":", "_"))

              return result

            workingList.extend(getEntries(argString[1:].replace(":", "_")))
          else:
            #The tag isn't defined here. Append it to the pack anyway in case it's defined somewhere else.
            workingList.append(argString)
        elif "=" in argString or "<" in argString or ">" in argString:
          args = main.words(",", argString, [['"','"']], False, False)
          pars = {}
          li = []
          opCount = 0

          for arg in args:
            match = re.match(r"^(?P<key>.+)(?P<operation>\>=|\<=|!=|==|\>|\<)(?P<value>.+)$", arg)
            if not match == None:
              opCount += 1
              operation = match.group("operation")
              key = match.group("key")
              value = match.group("value")
              if operation == "==":
                for i in options:
                  if i[key] == value.lower():
                    li.append(i["namespace"] + ":" + i["name"])
                  else:
                    li.remove(i["namespace"] + ":" + i["name"])
              elif operation == "!=":
                for i in options:
                  if not i[key] == value.lower():
                    li.append(i["namespace"] + ":" + i["name"])
                  elif (i["namespace"] + ":" + i["name"]) in li:
                    li.remove(i["namespace"] + ":" + i["name"])
              elif operation == ">":
                value = numberCast(value)
                for i in options:
                  if numberCast(i[key]) > value:
                    li.append(i["namespace"] + ":" + i["name"])
                  elif (i["namespace"] + ":" + i["name"]) in li:
                    li.remove(i["namespace"] + ":" + i["name"])
              elif operation == "<":
                value = numberCast(value)
                for i in options:
                  if numberCast(i[key]) < value:
                    li.append(i["namespace"] + ":" + i["name"])
                  elif (i["namespace"] + ":" + i["name"]) in li:
                    li.remove(i["namespace"] + ":" + i["name"])
              elif operation == ">=":
                value = numberCast(value)
                for i in options:
                  if numberCast(i[key]) >= value:
                    li.append(i["namespace"] + ":" + i["name"])
                  elif (i["namespace"] + ":" + i["name"]) in li:
                    li.remove(i["namespace"] + ":" + i["name"])
              elif operation == "<=":
                value = numberCast(value)
                for i in options:
                  if numberCast(i[key]) <= value:
                    li.append(i["namespace"] + ":" + i["name"])
                  elif (i["namespace"] + ":" + i["name"]) in li:
                    li.remove(i["namespace"] + ":" + i["name"])
            elif "=" in arg:
              par = main.words("=", arg, [['"','"']], False, False)
              if not par[0] in pars:
                pars[par[0]] = []
              pars[par[0]].append(par[1])

          if opCount == 0:
            for i in options:
              li.append(i["namespace"] + ":" + i["name"])

          if "sort" in pars:
            if pars["sort"][-1] == "alphabetical":
              li = sorted(li)
              pass
            else:
              def value(li1):
                def inner(x):
                  split = x.split(":")
                  for i in li1:
                    if i["namespace"] == split[0] and i["name"] == split[1]:
                      num = numberCast(i[pars["sort"][-1]])
                      if not num == -math.inf:
                        return (1,num)
                      else:
                        return (2,i[pars["sort"][-1]])
                  return (0,x)
                return inner
              
              li = sorted(li, key=value(options))

          if "reverse" in pars:
            if pars["reverse"][-1].lower() == "true":
              li.reverse()

          if "limit" in pars:
            li = li[:min(len(li),int(numberCast(pars["limit"][-1])))]
          
          if "in" in pars:
            for seg in pars["in"]:
              for i in li:
                if not seg in i:
                  li.remove(i)
          
          if "notin" in pars:
            for seg in pars["notin"]:
              for i in li:
                if seg in i:
                  li.remove(i)
          for i in li:
            workingList.append(i)
        else:
          reverse = False
          if argString[0] == "!":
            argString = argString[1:]
            reverse = True
          else:
            reverse = False
          
          for i in options:
              if argString in i["name"] and not reverse:
                workingList.append(i["namespace"] + ":" + i["name"])
              elif reverse and not argString in i["name"]:
                workingList.append(i["namespace"] + ":" + i["name"])
          
      elif ":" in workingString:
        workingList.append(workingString)
      else:
        workingList.append("minecraft:" + workingString)

      if line[0] == "+":
        for i in workingList:
          if not i.strip() in result:
            result.append(i.strip())
      elif line[0] == "-":
        for i in workingList:
          element = i.strip()
          if element in result:
            result.remove(element)
    elif line == "reverse":
      result.reverse()
    elif main.segment("sort", 0, line):
      argString = main.groups(line, [["(",")"]], False)[0]
      if argString == "alphabetical":
        result = sorted(result)
        pass
      else:
        def value(li1):
          def inner(x):
            split = x.split(":")
            for i in li1:
              if i["namespace"] == split[0] and i["name"] == split[1]:
                num = numberCast(i[argString])
                if not num == -math.inf:
                  return (1,num)
                else:
                  return (2,i[argString])
            return (0,x)
          return inner

        result = sorted(result, key=value(options))
    elif main.segment("limit", 0, line):
      argString = main.groups(line, [["(",")"]], False)[0]
      result = result[:min(len(result),int(numberCast(argString)))]

  if not useSnapshots:
    # Filter into a new list; removing from result while iterating over it would skip entries.
    filtered = []
    for entry in result:
      option = getOption(options, entry)
      if option == None or "snapshot" not in option or option["snapshot"].lower() != "true":
        filtered.append(entry)
    result = filtered

  name_split = re.split(r"(/|\\)", name)

  if len(name_split) > 1:
    os.makedirs(f".generated/packs/{packName}/data/{packId}/tags/{t}/{'/'.join(name_split[:len(name_split)-1])}", exist_ok=True)
  with open(f".generated/packs/{packName}/data/{packId}/tags/{t}/{name}.json", "w+") as file1:
    json.dump({"replace": False, "values":result}, file1,indent=4)

  if len(name_split) > 1:
    os.makedirs(f".saved/tags/{t}/{'/'.join(name_split[:len(name_split)-1])}", exist_ok=True)
    #print(f".saved/tags/{t}/{'/'.join(name_split[:len(name_split)-1])}")
  with open(f".saved/tags/{t}/{name}.txt", "w+") as data:
    data.write("\n".join(result))

  print(f'done loading "{file}"')
  print(f'deleting "{t}.csv" from memory to save space')
  del options

  done.append(file)
  return result
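The last two helpers lean on main.segment(needle, pos, text) as a predicate, e.g. main.segment("minecraft_", 0, f) or main.segment("sort", 0, line). Its implementation is not part of this listing; judging from the call sites it appears to test whether text matches needle starting at index pos. A hypothetical stand-in with that assumed behaviour:

def segment(needle, pos, text):
    # Assumed semantics, inferred only from the call sites above:
    # True if text is long enough and equals needle at offset pos.
    return text[pos:pos + len(needle)] == needle

print(segment("minecraft_", 0, "minecraft_oak_planks"))  # True
print(segment("sort", 0, "sort(alphabetical)"))          # True
print(segment("sort", 0, "limit(5)"))                    # False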