import re

# processhelper, models, Cora_labeled and dextrapreclustering referenced below
# are project-level modules assumed to be imported elsewhere.


def patternStringGenerator(seed):

    if seed.isdigit():  # seed is all digits
        length = len(seed)
        # print(length)
        strr = r'\b[0-9]{' + str(length) + r'}\b'
        # print(strr)
        return strr
    else:
        aaa = re.sub(r"\D", "", seed)
        if len(aaa) > 0:  # seed contains at least one digit
            restr = [r'\b']
            for i in seed:
                if i.isalpha():
                    restr.append(r'[a-z]')
                else:
                    if i.isdigit():
                        restr.append(r'\d')
                    else:
                        restr.append(r'.')
            restr.append(r'\b')
            restrr = ''.join(restr)
            # print(restr)
            # print(restrr)
            return restrr
        elif seed.isalpha():
            print("is alpha")
            return None
        else:
            if len(seed) > 30:
                # TODO: add fuzzy matching for long strings?
                return None
            cleaningseed = processhelper.simpledatacleaning(seed)
            splists = cleaningseed.split(' ')
            if len(splists) > 0:
                restr = [r'\b', splists[0]]
                if len(splists) < 2:
                    a = r'[a-zA-Z]+'
                    restr.append(r'[^a-zA-Z0-9]*')
                    restr.append(a)
                else:
                    for _ in range(1, len(splists) - 1):
                        a = r'[a-zA-Z]+'
                        restr.append(r'[^a-zA-Z0-9]*')
                        restr.append(a)
                    restr.append(r'[^a-zA-Z0-9]*')
                    restr.append(splists[len(splists) - 1])

                restr.append(r'\b')
                restrr = ''.join(restr)
                print(restrr)
                return restrr
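
Usage sketch (not part of the original function): the patterns it returns are ordinary regular expressions, so they can be fed straight to re.findall. The seeds '2004' and 'vol3' and the record text below are hypothetical; the two pattern strings follow from the digit-only and mixed-character branches above.

import re

# Hypothetical seeds and the patterns the branches above would return:
#   patternStringGenerator('2004') -> r'\b[0-9]{4}\b'
#   patternStringGenerator('vol3') -> r'\b[a-z][a-z][a-z]\d\b'
pattern_for_year = r'\b[0-9]{4}\b'
pattern_for_mixed = r'\b[a-z][a-z][a-z]\d\b'

text = 'proceedings of sigir 2004 vol3'
print(re.findall(pattern_for_year, text))   # ['2004']
print(re.findall(pattern_for_mixed, text))  # ['vol3']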
Example #2
def AttributeTrainingDataMixGenerator(username, attr_id):
    print(username)
    traingdata = []
    testingdata = []
    validationdata = []
    dataset = Cora_labeled.objects.all()
    entitys = models.sigirCoraToAttrEntity.objects.filter(user=username)
    for record in dataset:
        syns = entitys.filter(cora_id=record.id)

        if syns:
            attrids = [syn.attrsynonym.value.attr_id for syn in syns]
            if attr_id in attrids:
                text = record.cleantext
                labels = {}  # token -> value_id for the target attribute, or 'ooo' for other labelled attributes
                texttemp = []
                for syn in syns:

                    print(syn)
                    synonym = syn.attrsynonym.synonym

                    aa = processhelper.simpledatacleaning(synonym)
                    aalist = aa.split(' ')
                    for item in aalist:
                        if syn.attrsynonym.value.attr_id == attr_id:
                            labels[item] = syn.attrsynonym.value_id
                        else:
                            labels[item] = 'ooo'
                    # text = text.replace(synonym,' '+synonym+'###'+syn.attrsynonym.value.attr.attrname+' ')
                # text = processhelper.simpledatacleaning(text)
                textlist = text.split(' ')
                left = list(set(textlist).difference(set(labels.keys())))
                for item in textlist:
                    if item in labels.keys():
                        texttemp.append(item + '/' + str(labels[item]))
                    else:
                        # texttemp.append(item + '/NULL')
                        print(item + '/NULL')
                # keep at most three of the unlabelled tokens as '/NULL' examples
                for i in range(len(left)):
                    if i < 3:
                        texttemp.append(left[i] + '/NULL')
                traingdata.append(' '.join(texttemp))
            else:
                testingdata.append(record.cleantext)
        else:
            testingdata.append(record.cleantext)

    validationdata = traingdata  # the validation split simply mirrors the training split here
    return traingdata, testingdata, validationdata
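
A minimal standalone sketch of the token-tagging scheme used above, with a hypothetical cleaned record and a hypothetical label dict (tokens of the target attribute map to a value_id, tokens of other labelled attributes map to the filler tag 'ooo'). For readability the sketch tags every unlabelled token with '/NULL', whereas the function above keeps at most three of them per record.

# Hypothetical label dict and record text, for illustration only.
token_labels = {'sigir': 7, 'retrieval': 7, '1998': 'ooo'}
textlist = 'modern information retrieval sigir 1998'.split(' ')

tagged = []
for item in textlist:
    if item in token_labels:
        tagged.append(item + '/' + str(token_labels[item]))
    else:
        tagged.append(item + '/NULL')

print(' '.join(tagged))
# modern/NULL information/NULL retrieval/7 sigir/7 1998/ooo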
Example #3
def hbfaffinegapOnlyclusterSimple(corahbf, dataset, acr_threshold, username,
                                  dis_threshold):
    # corahbf, acr_threshold and dis_threshold are currently unused here.
    subrecords = []
    dataset = dataset.order_by('id')
    record_ids = [data.id for data in dataset]
    for data in dataset:
        # print(data.cleantext)
        subrecord = data.cleantext
        coraent = models.sigirCoraToAttrEntity.objects.filter(user=username,
                                                              cora_id=data.id)
        for cc in coraent:
            syn = cc.attrsynonym.synonym
            cleansyn = processhelper.simpledatacleaning(syn)
            subrecord = subrecord.replace(cleansyn, cc.attrsynonym.value.value)
        subrecords.append(subrecord)
    subgroups = dextrapreclustering.affinegaphClustering(data=subrecords)
    # map each record id back to the cluster label of its normalised text
    cluster_dict = dict(zip(record_ids, subgroups))
    return cluster_dict
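
A small sketch of consuming the returned mapping, using a hypothetical result: the record-id-to-label dictionary can be inverted into groups of record ids per cluster.

from collections import defaultdict

# Hypothetical output of hbfaffinegapOnlyclusterSimple: record id -> cluster label.
cluster_dict = {101: 0, 102: 0, 103: 1, 104: 1, 105: 0}

clusters = defaultdict(list)
for record_id, label in cluster_dict.items():
    clusters[label].append(record_id)

print(dict(clusters))  # {0: [101, 102, 105], 1: [103, 104]}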
Example #4
def basicTrainingDataGenerator(username):
    print(username)
    traingdata = []
    testingdata = []
    validationdata = []
    dataset = Cora_labeled.objects.all()
    entitys = models.sigirCoraToAttrEntity.objects.filter(user=username)
    for record in dataset:
        syns = entitys.filter(cora_id=record.id)
        temp = syns.count()
        if temp > 2:
            print(temp)
            text = record.cleantext
            labels = {}  # token -> attribute name of the labelled synonym it came from
            texttemp = []
            for syn in syns:

                print(syn)
                synonym = syn.attrsynonym.synonym
                aa = processhelper.simpledatacleaning(synonym)
                aalist = aa.split(' ')
                for item in aalist:
                    labels[item] = syn.attrsynonym.value.attr.attrname
                # text = text.replace(synonym,' '+synonym+'###'+syn.attrsynonym.value.attr.attrname+' ')
            # text = processhelper.simpledatacleaning(text)
            textlist = text.split(' ')
            for item in textlist:
                if item in labels.keys():
                    texttemp.append(item + '/' + labels[item])
                else:
                    texttemp.append(item + '/NULL')
            if temp < 4:  # exactly three labelled synonyms -> validation split
                validationdata.append(' '.join(texttemp))
            else:
                traingdata.append(' '.join(texttemp))
        else:
            testingdata.append(record.cleantext)
    return traingdata, testingdata, validationdata
Example #5
def collectSubrecords(node, dataset, username):
    # dataset = Cora_labeled.objects.all()
    nodeRecords = node.get_index_recordids()
    edges_dict = node.get_edges_dict()
    # find values shared by every record indexed under this node
    commonValues = []
    for layer in edges_dict.keys():
        layer_edges_dict = edges_dict[layer]
        for nodevalue in layer_edges_dict.keys():
            recordsids = layer_edges_dict[nodevalue]
            if set(nodeRecords).issubset(set(recordsids)):
                commonValues.append(nodevalue)
    print(node.get_attributeValue(), ':', nodeRecords)
    commonValues.append(node.get_attributeValue())
    # print(commonValues)
    datas = dataset.filter(id__in=nodeRecords)
    # record_id_dict = {}
    # for i in range(len(nodeRecords)):
    #     record_id_dict[i] = nodeRecords[i]
    # collect subrecords
    subrecords = []
    for data in datas:
        # print(data.cleantext)
        subrecord = data.cleantext
        coraent = models.sigirCoraToAttrEntity.objects.filter(user=username,
                                                              cora_id=data.id)
        for cc in coraent:
            syn = cc.attrsynonym.synonym
            cleansyn = processhelper.simpledatacleaning(syn)
            if cc.attrsynonym.value.value in commonValues:
                # print(cleansyn)
                subrecord = subrecord.replace(cleansyn, '')
                # print(subrecord)
            else:
                subrecord = subrecord.replace(cleansyn,
                                              cc.attrsynonym.value.value)
        subrecords.append(subrecord)
    return subrecords
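
A minimal standalone sketch of the substitution rule in the loop above, with hypothetical synonyms, canonical values and common-value set: spans whose canonical value is already shared by every record under the node are removed outright, the remaining spans are replaced by their canonical values.

# All names and values here are hypothetical, for illustration only.
common_values = {'SIGIR'}
record = 'proc acm sigir conf melbourne 1998'
synonym_to_value = {'proc acm sigir conf': 'SIGIR', 'melbourne 1998': '1998'}

subrecord = record
for synonym, value in synonym_to_value.items():
    if value in common_values:
        # the value is shared by the whole node, so drop the span entirely
        subrecord = subrecord.replace(synonym, '')
    else:
        subrecord = subrecord.replace(synonym, value)

print(repr(subrecord))  # ' 1998'  (the span for the common value 'SIGIR' was removed)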