Exemple #1
0
def mpNoRank(header_list, thresh=0):
    """Map each header to the top fuzzy ``alias`` hit in the property index.

    Underscores in a header are replaced by spaces before it is used both as
    the query text and as the key in the returned dictionaries.

    Args:
        header_list: Iterable of header strings to look up.
        thresh: The first hit's ``_score`` must be strictly greater than this
            value for the hit to be accepted.

    Returns:
        A ``(ids, labels)`` tuple of dicts keyed by the normalised header.
        Accepted headers map to the first hit's ``_source['id']`` and
        ``_source['label']``; every other header maps to ``''`` in both.
    """
    logging.basicConfig(level=logging.ERROR)
    client = connect_elasticsearch()

    ids_by_header = {}
    labels_by_header = {}
    for raw_header in header_list:
        header = raw_header.replace('_', ' ')

        fuzzy_clause = {"query": header, "fuzziness": "AUTO"}
        payload = json.dumps({"query": {"match": {"alias": fuzzy_clause}}})
        response = search(client, property_index, payload)
        hits = response['hits']['hits']

        # Default to "no match"; only the first returned hit is considered.
        matched_id = ''
        matched_label = ''
        if len(hits) > 0 and hits[0]['_score'] > thresh:
            top_source = hits[0]['_source']
            matched_id = top_source['id']
            matched_label = top_source['label']

        ids_by_header[header] = matched_id
        labels_by_header[header] = matched_label

    return ids_by_header, labels_by_header
Exemple #2
0
def mpNoRankWType(header_list, type_list, thresh=0):
    """Map each header to the top hit among properties of the given data type.

    For every header the query requires an exact ``term`` match on the
    ``data type`` field and a ``match`` on either ``aliasId`` or ``aliasEn``.
    Underscores in a header are replaced by spaces before it is used as the
    query text and as the key in the returned dictionaries.

    Args:
        header_list: Sequence of header strings to look up.
        type_list: Sequence parallel to ``header_list``; ``type_list[i]`` is
            the data type required for ``header_list[i]``.
        thresh: The first hit's ``_score`` must be strictly greater than this
            value for the hit to be accepted.

    Returns:
        A ``(ids, labels)`` tuple of dicts keyed by the normalised header.
        Accepted headers map to the first hit's ``_source['id']`` and
        ``_source['labelId']``; every other header maps to ``''`` in both.

    Raises:
        IndexError: If ``type_list`` is shorter than ``header_list``.
    """
    logging.basicConfig(level=logging.ERROR)
    es = connect_elasticsearch()

    result = {}
    resultLabel = {}
    # Pair headers with types by position. Looking the normalised header up
    # with header_list.index() fails for any header containing '_' and picks
    # the wrong type for duplicated headers.
    for position, raw_header in enumerate(header_list):
        elem = raw_header.replace('_', ' ')
        search_object = {
            "query": {
                "bool": {
                    "must": [{
                        "term": {
                            "data type": type_list[position]
                        }
                    }, {
                        "bool": {
                            "should": [{
                                "match": {
                                    "aliasId": elem
                                }
                            }, {
                                "match": {
                                    "aliasEn": elem
                                }
                            }]
                        }
                    }]
                }
            }
        }

        res = search(es, property_index,
                     json.dumps(search_object))['hits']['hits']
        if res and res[0]['_score'] > thresh:
            result[elem] = res[0]['_source']['id']
            resultLabel[elem] = res[0]['_source']['labelId']
        else:
            result[elem] = ''
            resultLabel[elem] = ''
    return result, resultLabel
Exemple #3
0
def _summed_word_similarity(vectors, words, other_words):
    """Sum the similarities of position-aligned word pairs.

    Words are lower-cased before lookup. Positions beyond the shorter of the
    two sequences are ignored, and a pair whose lookup raises ``KeyError``
    (a word missing from the vocabulary) contributes nothing to the sum.
    """
    total = 0
    for left, right in zip(words, other_words):
        try:
            total += vectors.similarity(left.lower(), right.lower())
        except KeyError:
            # Out-of-vocabulary word: skip this pair, keep the rest.
            continue
    return total


def mpRankWTypeSim(header_list, type_list, thresh=0):
    """Map each header to the hit whose label or alias is most similar to it.

    Up to 100 hits of the required data type are fetched per header and then
    re-scored with the Word2Vec model loaded from
    ``data/dump/w2vec_wiki_id_case``. A hit's score is the highest
    position-wise word similarity between the header and the hit's
    ``labelId`` or any entry of its ``aliasId``. The hit with the highest
    score wins; on ties the earliest hit in the search response is kept.

    Args:
        header_list: Sequence of header strings to look up.
        type_list: Sequence parallel to ``header_list``; ``type_list[i]`` is
            the data type required for ``header_list[i]``.
        thresh: Not used by this function; kept in the signature as-is.

    Returns:
        A ``(ids, labels)`` tuple of dicts keyed by the header with
        underscores replaced by spaces. Headers with at least one hit map to
        the winning hit's ``_source['id']`` and ``_source['labelId']``;
        headers with no hits map to ``''`` in both.

    Raises:
        IndexError: If ``type_list`` is shorter than ``header_list``.
    """
    log = logging.getLogger(__name__)
    namaFileModel = "data/dump/w2vec_wiki_id_case"
    log.debug("loading Word2Vec model from %s", namaFileModel)
    model = Word2Vec.load(namaFileModel)
    # similarity() lives on the KeyedVectors object (model.wv); fall back to
    # the model itself for releases that expose it there.
    vectors = getattr(model, 'wv', model)
    log.debug("Word2Vec model loaded")
    logging.basicConfig(level=logging.ERROR)
    es = connect_elasticsearch()
    log.debug("connected to Elasticsearch")

    result = {}
    resultLabel = {}
    # Pair headers with types by position. Looking the normalised header up
    # with header_list.index() fails for any header containing '_' and picks
    # the wrong type for duplicated headers.
    for position, raw_header in enumerate(header_list):
        elem = raw_header.replace('_', ' ')
        search_object = {
            "from": 0,
            "size": 100,
            "query": {
                "bool": {
                    "must": [{
                        "term": {
                            "data type": type_list[position]
                        }
                    }, {
                        "bool": {
                            "should": [{
                                "match": {
                                    "aliasId": elem
                                }
                            }, {
                                "match": {
                                    "aliasEn": elem
                                }
                            }]
                        }
                    }]
                }
            }
        }

        res = search(es, property_index,
                     json.dumps(search_object))['hits']['hits']
        if res:
            words = elem.split()
            for item in res:
                source = item['_source']
                sim_score = _summed_word_similarity(
                    vectors, words, source['labelId'].split())
                # NOTE(review): aliasId is iterated element-wise, so it is
                # presumably a list of alias strings - confirm against the
                # index mapping.
                for alt in source['aliasId']:
                    alt_score = _summed_word_similarity(
                        vectors, words, alt.split())
                    if alt_score > sim_score:
                        sim_score = alt_score
                # Replace the search engine's score with the similarity score.
                item['_score'] = sim_score

            # Choose the winner once, after every hit has been re-scored.
            best = max(res, key=lambda hit: hit['_score'])
            result[elem] = best['_source']['id']
            resultLabel[elem] = best['_source']['labelId']
        else:
            result[elem] = ''
            resultLabel[elem] = ''
    return result, resultLabel
Exemple #4
0
def _select_property_candidate(elem, res, min_score=18, max_distance=8):
    """Return the hit in ``res`` that best matches ``elem``, or ``None``.

    The stages are tried in order and the first hit satisfying a stage wins:

    1. the lower-cased header is contained in the hit's ``label``;
    2. the lower-cased header is contained in the hit's ``alias``;
    3. any lower-cased word of the header is contained in one alias entry;
    4. the lower-cased header is contained in the hit's ``description``;
    5. the hit owning the alias with the smallest ``lDistance`` to the
       lower-cased header, accepted only if that hit's ``_score`` exceeds
       ``min_score`` and the distance is below ``max_distance``.
    """
    needle = elem.lower()

    for cand in res:
        if needle in cand['_source']['label']:
            return cand

    # NOTE(review): alias is iterated element-wise in stage 3, so it is
    # presumably a list and this is an exact-element test - confirm.
    for cand in res:
        if needle in cand['_source']['alias']:
            return cand

    for word in elem.split():
        lowered = word.lower()
        for cand in res:
            for candword in cand['_source']['alias']:
                if lowered in candword:
                    return cand

    for cand in res:
        if needle in cand['_source']['description']:
            return cand

    # Track the closest alias over every alias of every hit.
    closest = None
    closest_distance = None
    for cand in res:
        for alt in cand['_source']['alias']:
            distance = lDistance(needle, alt)
            if closest_distance is None or distance < closest_distance:
                closest = cand
                closest_distance = distance

    if (closest is not None and closest['_score'] > min_score
            and closest_distance < max_distance):
        return closest
    return None


def mapProperty(header_list, protagonist):
    """Map each header to a property id and label from the property index.

    Each header (underscores replaced by spaces) is searched against the
    ``alias`` field, and one of the returned hits is picked by a cascade of
    progressively looser checks; see ``_select_property_candidate``.

    Args:
        header_list: Iterable of header strings to look up.
        protagonist: Not used by this function; kept in the signature as-is.

    Returns:
        A ``(ids, labels)`` tuple of dicts keyed by the normalised header,
        holding the chosen hit's ``_source['id']`` and ``_source['label']``.
        Headers for which no hit is chosen are absent from both dicts.
    """
    logging.basicConfig(level=logging.ERROR)
    es = connect_elasticsearch()

    result = {}
    resultLabel = {}
    for elem in header_list:
        elem = elem.replace('_', ' ')
        search_object = {
            'query': {
                'multi_match': {
                    'query': elem,
                    'fields': 'alias'
                }
            }
        }

        res = search(es, property_index,
                     json.dumps(search_object))['hits']['hits']
        chosen = _select_property_candidate(elem, res)
        if chosen is not None:
            result[elem] = chosen['_source']['id']
            resultLabel[elem] = chosen['_source']['label']

    return result, resultLabel
Exemple #5
0
def _select_typed_candidate(elem, res, min_score=18, max_distance=8):
    """Return the hit in ``res`` that best matches ``elem``, or ``None``.

    The stages are tried in order and the first hit satisfying a stage wins:

    1. the lower-cased header is contained in the hit's ``label``;
    2. the lower-cased header is contained in the hit's ``alias``;
    3. any lower-cased word of the header is contained in one alias entry;
    4. the lower-cased header is contained in the hit's ``description``;
    5. the hit owning the alias with the smallest ``lDistance`` to the
       lower-cased header, accepted only if that hit's ``_score`` exceeds
       ``min_score`` and the distance is below ``max_distance``.
    """
    needle = elem.lower()

    for cand in res:
        if needle in cand['_source']['label']:
            return cand

    # NOTE(review): alias is iterated element-wise in stage 3, so it is
    # presumably a list and this is an exact-element test - confirm.
    for cand in res:
        if needle in cand['_source']['alias']:
            return cand

    for word in elem.split():
        lowered = word.lower()
        for cand in res:
            for candword in cand['_source']['alias']:
                if lowered in candword:
                    return cand

    for cand in res:
        if needle in cand['_source']['description']:
            return cand

    # Track the closest alias over every alias of every hit.
    closest = None
    closest_distance = None
    for cand in res:
        for alt in cand['_source']['alias']:
            distance = lDistance(needle, alt)
            if closest_distance is None or distance < closest_distance:
                closest = cand
                closest_distance = distance

    if (closest is not None and closest['_score'] > min_score
            and closest_distance < max_distance):
        return closest
    return None


def mpRankWType(header_list, type_list, thresh=0):
    """Map each header to a property of the given data type, chosen by rules.

    For every header the query requires an exact ``term`` match on the
    ``data type`` field and a ``match`` on either ``aliasId`` or ``aliasEn``.
    One of the returned hits is then picked by a cascade of progressively
    looser checks; see ``_select_typed_candidate``.

    Args:
        header_list: Sequence of header strings to look up.
        type_list: Sequence parallel to ``header_list``; ``type_list[i]`` is
            the data type required for ``header_list[i]``.
        thresh: Not used by this function; kept in the signature as-is.

    Returns:
        A ``(ids, labels)`` tuple of dicts keyed by the header with
        underscores replaced by spaces. A header maps to the chosen hit's
        ``_source['id']`` and ``_source['label']``, or to ``''`` in both
        dicts when no hit is chosen (including when the search is empty).

    Raises:
        IndexError: If ``type_list`` is shorter than ``header_list``.
    """
    logging.basicConfig(level=logging.ERROR)
    es = connect_elasticsearch()

    result = {}
    resultLabel = {}
    # Pair headers with types by position. Looking the normalised header up
    # with header_list.index() fails for any header containing '_' and picks
    # the wrong type for duplicated headers.
    for position, raw_header in enumerate(header_list):
        elem = raw_header.replace('_', ' ')
        search_object = {
            "query": {
                "bool": {
                    "must": [{
                        "term": {
                            "data type": type_list[position]
                        }
                    }, {
                        "bool": {
                            "should": [{
                                "match": {
                                    "aliasId": elem
                                }
                            }, {
                                "match": {
                                    "aliasEn": elem
                                }
                            }]
                        }
                    }]
                }
            }
        }

        res = search(es, property_index,
                     json.dumps(search_object))['hits']['hits']
        # NOTE(review): the query targets aliasId/aliasEn but the selection
        # reads label/alias/description from _source - confirm these fields
        # exist in the index mapping.
        chosen = _select_typed_candidate(elem, res)
        if chosen is not None:
            result[elem] = chosen['_source']['id']
            resultLabel[elem] = chosen['_source']['label']
        else:
            result[elem] = ''
            resultLabel[elem] = ''

    return result, resultLabel