예제 #1
0
def process_word_E(question):
    """Return the top-k entity matches for *question* from the search index.

    The text is stripped of punctuation that interferes with the index
    query syntax, then ``searchIndex.entitySearch`` is queried and the
    results are ranked by score (third tuple element).

    Parameters
    ----------
    question : str
        Raw natural-language question.

    Returns
    -------
    list[list]
        ``[[uri, score], ...]`` for the best ``k`` entities; an empty
        list when the search backend raises.
    """
    k = 1  # number of top-ranked entities to keep

    # Strip characters that break the index lookup (order matters:
    # "'s" must be removed before the lone apostrophe).
    for token in ("?", ".", "!", "'s", "'", "\\", "#"):
        question = question.replace(token, "")

    try:
        entityResults = searchIndex.entitySearch(question)
    except Exception:  # search backend failure is treated as "no result"
        return []

    # Keep the k highest-scoring candidates (score sits at index 2).
    entities = sorted(entityResults, reverse=True, key=lambda x: x[2])[:k]
    return [[entity[1], entity[2]] for entity in entities]
예제 #2
0
def process_word_E_long(question):
    """Return top-k entity matches for *question* via the full combination
    pipeline (stop-word extraction, verb splitting, stop-word merging,
    comparison splitting, abbreviation extraction).

    Parameters
    ----------
    question : str
        Raw natural-language question.

    Returns
    -------
    list[list]
        ``[[uri, source_term], ...]`` — the best ``k`` entities for each
        surviving term; an empty list when the entity search raises.
    """
    k = 1  # top results kept per term

    entities = []

    originalQuestion = question
    # Strip characters that break the index lookup (order matters:
    # "'s" must be removed before the lone apostrophe).
    for token in ("?", ".", "!", "'s", "'", "\\", "#"):
        question = question.replace(token, "")
    question = question[0].lower() + question[1:]

    questionStopWords = stopwords.extract_stop_words_question(
        question, stopWordsList)
    combinations = get_question_combinatios(question, questionStopWords)
    combinations = split_base_on_verb(combinations, originalQuestion)

    # Drop lower-case terms that look like relations/properties rather
    # than entities.  BUG FIX: the original used ``del combinations[idx]``
    # while enumerating the same list, which silently skipped the element
    # following each deletion; building a filtered copy avoids that.
    filtered = []
    for term in combinations:
        if len(term) == 0 or term[0].istitle():
            filtered.append(term)
            continue
        ontologyResults = searchIndex.ontologySearch(term)
        propertyResults = searchIndex.propertySearch(term)
        if len(ontologyResults) > 2 or len(propertyResults) > 0:
            continue  # relation-like term: exclude from entity candidates
        filtered.append(term)
    combinations = filtered

    combinations = merge_comb_stop_words(combinations, question,
                                         questionStopWords)
    combinations = sort_combinations(combinations, question)
    combinations = merge_entity_prefix(question, combinations,
                                       originalQuestion)
    combinations, compare_found = split_bas_on_comparison(combinations)
    combinations = extract_abbreviation(combinations)

    try:
        for term in combinations:
            entityResults = searchIndex.entitySearch(term)
            if len(entityResults) > 0:
                # Remember which surface form produced each candidate.
                entities.append([entity + [term] for entity in entityResults])
    except Exception:  # search backend failure is treated as "no result"
        return []

    # Per term, keep only the k highest-scoring entities (score at idx 2).
    results = []
    for candidates in entities:
        results.extend(sorted(candidates, reverse=True,
                              key=lambda x: x[2])[:k])

    # entity[4] is the source term appended above.
    return [[entity[1], entity[4]] for entity in results]
예제 #3
0
파일: main.py 프로젝트: dice-group/LAUREN
def evaluate(raw):
    """Run the FALCON DBpedia entity/relation linking pipeline on one record.

    ``raw`` is a positional record: ``raw[0]`` holds the question text,
    ``raw[2]`` the gold relation URIs and ``raw[3]`` the gold entity URIs
    (both consulted because ``evaluation`` is hard-coded to True).

    Side effects: resets and updates the module-level counters
    ``correctRelations``/``wrongRelations``/``correctEntities``/
    ``wrongEntities``/``count`` and appends the elapsed wall-clock time
    to ``raw``.

    Returns ``(entities_falcon, relations_falcon)``: the predicted entity
    URIs and relation URIs.
    """
    evaluation=True
    startTime=time.time()
    oneQuestion=False  # when True, return early without evaluation
    global correctRelations
    global wrongRelations
    global correctEntities
    global wrongEntities
    global count
    correctRelations = 0
    wrongRelations = 0
    correctEntities = 0
    wrongEntities = 0
    count = 1
    p_entity=0
    r_entity=0
    p_relation=0
    r_relation=0
    k=1  # candidates kept per term during re-ranking / mixing
    correct=True
    questionRelationsNumber=0
    entities=[]
    questionWord=raw[0].strip().split(' ')[0]  # interrogative head word
    mixedRelations=[]
    #beforeMixRelations=[]
    question=raw[0]
    originalQuestion=question
    # Normalise: lower-case the first character and strip punctuation
    # that interferes with the index query syntax.
    question=question[0].lower() + question[1:]
    question=question.replace("?","")
    question=question.replace(".","")
    question=question.replace("!","")
    #question=question.replace("'s","")
    #question=question.replace("'","")
    question=question.replace("\\","")
    question=question.replace("#","")
    # Tokenise the question into candidate surface forms ("combinations").
    questionStopWords=stopwords.extract_stop_words_question(question,stopWordsList)
    combinations=get_question_combinatios(question,questionStopWords)
    #combinations=merge_comb_stop_words(combinations,question,questionStopWords)
    #print(combinations)
    combinations=split_base_on_verb(combinations,originalQuestion)
    #combinations=split_base_on_titles(combinations)
    #print(combinations)
    combinations=split_base_on_s(combinations)
    oldCombinations=combinations

    # A lower-case term with neither ontology nor property hits is assumed
    # to be an entity: capitalise it in the list and inside the question.
    for idx,term in enumerate(combinations):
        if len(term)==0:
            continue
        if term[0].istitle():
            continue;
        ontologyResults=searchIndex.ontologySearch(term)
        propertyResults=searchIndex.propertySearch(term)
        if len(ontologyResults) == 0 and len(propertyResults) == 0:
            combinations[idx]=term.capitalize()
            question=question.replace(term,term.capitalize())

    combinations=merge_comb_stop_words(combinations,question,questionStopWords)
    combinations=sort_combinations(combinations,question)
    combinations=merge_entity_prefix(question,combinations,originalQuestion)
    combinations,compare_found=split_bas_on_comparison(combinations)
    combinations=extract_abbreviation(combinations)
    #print(combinations)
    i=0
    nationalityFlag=False
    for term in combinations:
        #print(term)
        relations=[]
        properties=[]
        entities_term=[]
        if len(term)==0:
            continue
        #relations=reRank_results(term,relations)
        # Nationality handling: when the helper maps the term to a country
        # name (empty string means no match), add that country as an extra
        # high-weight entity candidate.
        countryImprovement=realtions_entities_country_improvement(term)
        if countryImprovement != "":
            #print("hi")
            nationalityFlag=True
            entities.append([["country",countryImprovement,0,20,term]])
        # Entity heuristic: not a verb AND (title-cased, more than two
        # words, few ontology hits, or any upper-case letter) -> search
        # the entity index; otherwise treat the term as a relation.
        if (not word_is_verb(term,originalQuestion)) and (term[0].istitle() or len(term.split(' ')) > 2 or (len(term)>1 and  len(searchIndex.ontologySearch(term)) < 2 ) or (any(x.isupper() for x in term))) :
            #print("hi")

            entityResults=searchIndex.entitySearch(term)
            # Coordinated terms are additionally searched piecewise.
            if " and " in term:
                for word in term.split(' and '):
                    entityResults.extend(searchIndex.entitySearch(word.strip()))
            if " or " in term:
                for word in term.split(' or '):
                    entityResults.extend(searchIndex.entitySearch(word.strip()))
            if len(entityResults)!=0:
                # De-duplicate by URI, remembering the source term.
                for result in entityResults:
                    if result[1] not in [e[1] for e in entities_term]:
                        entities_term.append(result+[term])
                #print(len(entities_term))
                entities.append(entities_term)
                    #print(entities)
        else:
            # Relation candidate: keep ontology hits whose URI local name
            # is lower-case, plus every property hit.
            ontologyResults=searchIndex.ontologySearch(term)
            if len(ontologyResults)!=0:
                for result in ontologyResults:
                    if not (result[1][result[1].rfind('/')+1:][0].istitle()):
                        relations.append(result+[term])
            propertyResults=searchIndex.propertySearch(term)
            if len(propertyResults)!=0:
                    propertyResults=[result+[term] for result in propertyResults]
                    properties=properties+propertyResults 
            mixedRelations.append("")
            mixedRelations[i]=relations+properties

            i=i+1

    questionRelationsNumber=len(mixedRelations)
    oldEnities=entities
    # "when" questions without any relation candidate (and comparison
    # questions) fall back to the generic DBpedia date relations.
    if (len(mixedRelations)==0 and questionWord.lower()=="when") or compare_found:
        mixedRelations.append([["date","http://dbpedia.org/ontology/date",0,20],["date","http://dbpedia.org/property/date",0,20]])
        compare_found=False

    # Re-rank each term's relation list jointly with the entity candidates.
    for i in range(len(mixedRelations)):
        #print(i)
        mixedRelations[i]=distinct_relations(mixedRelations[i])


        mixedRelations[i],entities=reRank_relations(entities,mixedRelations[i],questionWord,questionRelationsNumber,question,k)



    # Flatten the per-term candidate lists, keeping the top k overall.
    mixedRelations=mix_list_items(mixedRelations,k)

    entities=mix_list_items_entities(entities,k)
    mixedRelations.extend(relations_improvement_country(entities))

    if nationalityFlag:
        mixedRelations.append(["country","http://dbpedia.org/ontology/country",20])

    if oneQuestion:
        #print("Relations:")
        #print(mixedRelations)
        #print("Entities:")
        #print(entities)
        return
    if(evaluation):
        # Score predicted relations against the gold relations raw[2];
        # the per-relation check compares URI local names (after the
        # last '/').
        numberSystemRelations=len(raw[2])
        intersection= set(raw[2]).intersection([tup[1] for tup in mixedRelations])
        if numberSystemRelations!=0 and len(mixedRelations)!=0:
            p_relation=len(intersection)/len(mixedRelations)
            r_relation=len(intersection)/numberSystemRelations
        for relation in raw[2]:

            if relation[relation.rfind('/')+1:] in [tup[1][tup[1].rfind('/')+1:] for tup in mixedRelations]:
                #p_relation=1/numberSystemRelations
                correctRelations=correctRelations+1
                #print(raw[0])
            else:

                wrongRelations=wrongRelations+1
                correct=False
                global questions_labels



        # Score predicted entities against the gold entities raw[3].
        numberSystemEntities=len(raw[3])
        intersection= set(raw[3]).intersection([tup[1] for tup in entities])
        if numberSystemEntities!=0 and len(entities)!=0 :
            p_entity=len(intersection)/len(entities)
            r_entity=len(intersection)/numberSystemEntities
        for entity in raw[3]:
            if entity in [tup[1] for tup in entities]:

                correctEntities=correctEntities+1
            else:
                wrongEntities=wrongEntities+1
                correct=False
                print(raw[0])




        #print(count)
        #print(p_entity)
        count=count+1
    endTime=time.time()
    raw.append(endTime-startTime)


    #print(mixedRelations)
    #print(entities)
    ############        
    #raw.append([[tup[1],tup[4]] for tup in mixedRelations])
    #raw.append([[tup[1],tup[4]] for tup in entities])
    #############
    #raw.append(p_relation)
    #raw.append(r_relation)
    #return raw
    relations_falcon = [tup[1] for tup in mixedRelations]
    entities_falcon = [tup[1] for tup in entities]
    return entities_falcon, relations_falcon
예제 #4
0
def evaluate(raw):
    """Run the Wikidata entity/relation linking pipeline on one record.

    Parameters
    ----------
    raw : list
        Positional record: ``raw[0]`` is the question text; ``raw[1]``
        holds the gold entity ids and ``raw[2]`` the gold relation ids
        (both only consulted when ``evaluation`` is True).

    Side effects
    ------------
    Updates the module-level counters ``correctRelations``,
    ``wrongRelations``, ``correctEntities``, ``wrongEntities`` and
    ``count``, and appends the predicted ``[uri, surface_form]`` pair
    lists (relations, then entities) to ``raw``.

    Returns
    -------
    list
        ``raw`` with the two prediction lists appended.
    """
    evaluation = False  # flip to True to score against the gold data in raw
    global correctRelations
    global wrongRelations
    global correctEntities
    global wrongEntities
    global count
    count = 1
    p_entity = 0
    r_entity = 0
    p_relation = 0
    r_relation = 0
    k = 5  # candidates kept per term during re-ranking / mixing
    questionRelationsNumber = 0
    entities = []
    questionWord = raw[0].strip().split(' ')[0]  # interrogative head word
    mixedRelations = []
    question = raw[0]
    originalQuestion = question
    # Normalise: lower-case the first character and strip punctuation
    # that interferes with the index query syntax.
    question = question[0].lower() + question[1:]
    for token in ("?", ".", "!", "\\", "#"):
        question = question.replace(token, "")

    # Tokenise the question into candidate surface forms ("combinations").
    questionStopWords = wiki_stopwords.extract_stop_words_question(
        question, stopWordsList)
    combinations = get_question_combinatios(question, questionStopWords)
    combinations = merge_comb_stop_words(combinations, question,
                                         questionStopWords)
    combinations = split_base_on_verb(combinations, originalQuestion)
    combinations = split_base_on_s(combinations)

    # A lower-case term with no property hits is assumed to be an entity:
    # capitalise it in the list and inside the question.
    for idx, term in enumerate(combinations):
        if len(term) == 0:
            continue
        if term[0].istitle():
            continue

        propertyResults = searchIndex.propertySearch(term)

        if len(propertyResults) == 0:
            combinations[idx] = term.capitalize()
            question = question.replace(term, term.capitalize())

    combinations = merge_comb_stop_words(combinations, question,
                                         questionStopWords)
    combinations = sort_combinations(combinations, question)
    combinations = merge_entity_prefix(question, combinations,
                                       originalQuestion)
    combinations, compare_found = split_bas_on_comparison(combinations)
    combinations = extract_abbreviation(combinations)

    i = 0
    nationalityFlag = False
    for term in combinations:
        properties = []
        entities_term = []
        if len(term) == 0:
            continue

        # Entity heuristic: not a verb AND (title-cased, more than two
        # words, or any upper-case letter) -> search the entity index;
        # otherwise treat the term as a relation candidate.
        if (not word_is_verb(term, originalQuestion)) and (
                term[0].istitle() or len(term.split(' ')) > 2 or
            (any(x.isupper() for x in term))):
            entityResults = wiki_search_elastic.entitySearch(term)
            # Coordinated terms are additionally searched piecewise.
            if " and " in term:
                for word in term.split(' and '):
                    entityResults.extend(
                        wiki_search_elastic.entitySearch(word.strip()))
            if " or " in term:
                for word in term.split(' or '):
                    entityResults.extend(
                        wiki_search_elastic.entitySearch(word.strip()))
            if len(entityResults) != 0:
                # De-duplicate by URI, remembering the source term.
                for result in entityResults:
                    if result[1] not in [e[1] for e in entities_term]:
                        entities_term.append(result + [term])
                entities.append(entities_term)
        else:
            propertyResults = wiki_search_elastic.propertySearch(term)
            if len(propertyResults) != 0:
                propertyResults = [
                    result + [term] for result in propertyResults
                ]
                properties = properties + propertyResults
            mixedRelations.append("")
            mixedRelations[i] = properties
            i = i + 1

    questionRelationsNumber = len(mixedRelations)
    # "when" questions without any relation candidate (and comparison
    # questions) fall back to Wikidata P569 (a date property).
    if (len(mixedRelations) == 0
            and questionWord.lower() == "when") or compare_found:
        mixedRelations.append(
            [["time", "http://www.wikidata.org/wiki/Property:P569", 0, 20]])
        compare_found = False

    # Re-rank each term's relation list jointly with the entity candidates.
    for i in range(len(mixedRelations)):
        mixedRelations[i] = distinct_relations(mixedRelations[i])
        mixedRelations[i], entities = reRank_relations(
            entities, mixedRelations[i], questionWord, questionRelationsNumber,
            question, k)

    # Flatten the per-term candidate lists, keeping the top k overall.
    mixedRelations = mix_list_items(mixedRelations, k)
    entities = mix_list_items_entities(entities, k)

    if nationalityFlag:
        mixedRelations.append(
            ["country", "https://www.wikidata.org/wiki/Property:P17", 20])

    if evaluation:
        # Precision/recall of the predicted relations against raw[2];
        # URI ids are compared on the local name after the last '/'.
        numberSystemRelations = 1
        intersection = set(raw[2]).intersection(
            [tup[1][tup[1].rfind('/') + 1:-1] for tup in mixedRelations])
        if numberSystemRelations != 0 and len(mixedRelations) != 0:
            p_relation = len(intersection) / len(mixedRelations)
            r_relation = len(intersection) / numberSystemRelations

        # BUG FIX: the original referenced an undefined name ``relation``
        # here, raising NameError whenever evaluation was enabled; iterate
        # the gold relations as the sibling DBpedia evaluator does.
        predicted_rel_ids = [
            tup[1][tup[1].rfind('/') + 1:] for tup in mixedRelations
        ]
        for relation in raw[2]:
            if relation[relation.rfind('/') + 1:] in predicted_rel_ids:
                correctRelations = correctRelations + 1
            else:
                wrongRelations = wrongRelations + 1

        # Precision/recall of the predicted entities against raw[1].
        # BUG FIX: the original then overwrote ``true_entity`` with a
        # single URI *string* built from raw[0] (the question text), so
        # the loop below iterated characters and numberSystemEntities
        # became the question length; keep the gold-entity list instead.
        true_entity = list(raw[1])
        numberSystemEntities = len(raw[1])
        intersection = set(true_entity).intersection(
            [tup[1][tup[1].rfind('/') + 1:-1] for tup in entities])

        if numberSystemEntities != 0 and len(entities) != 0:
            p_entity = len(intersection) / len(entities)
            r_entity = len(intersection) / numberSystemEntities
        for e in true_entity:
            if e in [tup[1][tup[1].rfind('/') + 1:-1] for tup in entities]:
                correctEntities = correctEntities + 1
            else:
                wrongEntities = wrongEntities + 1

        count = count + 1

    # Append [uri, surface_form] pairs for relations and entities.
    raw.append([[tup[1], tup[4]] for tup in mixedRelations])
    raw.append([[tup[1], tup[4]] for tup in entities])
    return raw
예제 #5
0
def evaluate(raw, rules, evaluation=True):
    """Run the Falcon 2.0 Wikidata linking pipeline with a selectable rule set.

    Parameters
    ----------
    raw : list
        Positional record: ``raw[0]`` is the question text, ``raw[1]`` the
        gold entity ids and ``raw[2]`` the gold relation ids (the latter
        only consulted when ``relations_flag`` is enabled below).
    rules : list[int]
        Numeric ids of the morphology rules to apply; each rule is
        described by the inline comments at its application site.
    evaluation : bool, optional
        When True, update the module-level correct/wrong counters and the
        precision/recall scores against the gold data in ``raw``.

    Returns
    -------
    list or None
        ``raw`` extended with the predicted ``[uri, surface_form]`` pair
        lists and the p/r scores, or None when any exception escaped
        (the whole body is wrapped in a catch-all that prints "error").
    """
    try:
        relations_flag = False  # relation scoring disabled by default
        global correctRelations
        #correctRelations=0
        global wrongRelations
        #wrongRelations=0
        global correctEntities
        #correctEntities=0
        global wrongEntities
        #wrongEntities=0
        global count
        print(count)
        p_entity = 0
        r_entity = 0
        p_relation = 0
        r_relation = 0
        k = 1  # candidates kept per term during re-ranking / mixing
        questionRelationsNumber = 0
        entities = []
        questionWord = raw[0].strip().split(' ')[
            0]  # Fetch the query head word

        mixedRelations = []
        question = raw[0]
        # Ensure the question ends with '?' before normalising.
        if question.strip()[-1] != "?":
            question = question + "?"
        originalQuestion = question
        # Lower-case the first character and strip punctuation that
        # interferes with the index query syntax.
        question = question[0].lower() + question[1:]
        question = question.replace("?", "")
        question = question.replace(".", "")
        question = question.replace("!", "")
        question = question.replace("\\", "")
        question = question.replace("#", "")

        # Default tokenisation when rule 1/2 are not selected: plain
        # whitespace split, no stop words.
        questionStopWords = []
        combinations = question.split(' ')
        combinations_relations = []
        """ Falcon 2.0 pipeline is implemented as a forward chain of a carefully curated list of rules based on 
            fundamental principles of the English morphology. The user is allowed to choose a set of rules to process the query.
            The "rules" list variable enlists the rules chosen by the user. 
            Based on this set of rules, the Falcon 2.0 pipeline processes the input query.
        """
        if any(x == 1 for x in rules):
            questionStopWords = extract_stop_words_question(
                question)  #rule1: Stopwords cannot be entities or relations
        if any(x == 2 for x in rules):
            combinations = get_question_combinatios(
                question, questionStopWords
            )  #rule 2: If two or more words do not have any stopword in between, consider them as a single compound word

        if any(x == 4 for x in rules):
            combinations, combinations_relations = split_base_on_verb(
                combinations, combinations_relations, originalQuestion
            )  #rule 4: Verbs cannot be an entity, Verbs act as a division point of the sentence in case of two entities and we do not merge tokens from either side of the verb.
            combinations = split_base_on_s(combinations)

        if any(x == 3 for x in rules):
            combinations, combinations_relations = merge_comb_stop_words(
                combinations, combinations_relations, question,
                questionStopWords
            )  #rule 3: Entities with only stopwords between them are one entity

        if any(x == 5 for x in rules):
            for idx, term in enumerate(
                    combinations
            ):  #rule 5: If a token does not have any relation candidate, identify it as an entity
                if len(term) == 0:
                    continue
                if term[0].istitle():
                    continue

                propertyResults = wiki_search_elastic.propertySearch(term)

                if len(propertyResults) == 0:
                    combinations[idx] = term.capitalize()
                    question = question.replace(term, term.capitalize())

            if any(x == 3 for x in rules):
                combinations = sort_combinations(combinations, question)

        if any(x == 8 for x in rules):
            combinations, compare_found = split_bas_on_comparison(
                combinations
            )  #rule 8: Comparison words acts as a point of division in case of two tokens/entities

        if any(x == 9 for x in rules):
            combinations = extract_abbreviation(
                combinations)  #rule 9: Abbreviations are separate entities

        if any(x == 10 for x in rules):
            combinations, combinations_relations = split_base_on_entities(
                combinations, combinations_relations, originalQuestion
            )  #rule 10: Split the surface form if it's already recognized as a Person

        if any(x == 14 for x in rules):
            combinations, combinations_relations = get_relations_seachindex(
                combinations, combinations_relations)  #rule 14

        combinations = upper_all_entities(combinations, originalQuestion)

        if any(x == 12 for x in rules):
            combinations = merge_comb_det(
                combinations, originalQuestion
            )  #rule 12: Merge the determiner in the combination, if preceding an entity

        #Rules applied during/after elastic search
        i = 0
        nationalityFlag = False
        # Entity linking: every surviving combination is searched in the
        # entity index; coordinated terms are additionally searched
        # piecewise, and results are de-duplicated by URI while
        # remembering the source term.
        for term in combinations:
            entities_term = []
            if len(term) == 0:
                continue

            if check_entities_in_text(originalQuestion, term):
                term = term.capitalize()

            entityResults = wiki_search_elastic.entitySearch(term)
            if " and " in term:
                for word in term.split(' and '):
                    entityResults.extend(
                        wiki_search_elastic.entitySearch(word.strip()))
            if " or " in term:
                for word in term.split(' or '):
                    entityResults.extend(
                        wiki_search_elastic.entitySearch(word.strip()))
            if len(entityResults) != 0:
                for result in entityResults:
                    if result[1] not in [e[1] for e in entities_term]:
                        entities_term.append(result + [term])
                entities.append(entities_term)

        # Relation linking: terms classified as relations are searched in
        # the property index.
        for term in combinations_relations:
            properties = []
            propertyResults = wiki_search_elastic.propertySearch(term)
            if len(propertyResults) != 0:
                propertyResults = [
                    result + [term] for result in propertyResults
                ]
                properties = properties + propertyResults
            mixedRelations.append("")
            mixedRelations[i] = properties
            i = i + 1

        questionRelationsNumber = len(mixedRelations)

        # "when" questions with no relation candidate default to Wikidata
        # P569 (a date property).
        if (len(mixedRelations) == 0 and questionWord.lower() == "when"):
            mixedRelations.append([[
                "time", "<http://www.wikidata.org/wiki/Property:P569>", 0, 20,
                "when"
            ]])

        # Re-rank each term's relation list jointly with the entities.
        for i in range(len(mixedRelations)):
            #print(i)
            mixedRelations[i] = distinct_relations(mixedRelations[i])
            try:
                if any(
                        x == 13 for x in rules
                ):  #rule13: If the text is a question, use the question word to increase the weight of all the relations which range matches the question word expected answer.
                    head_rule = True
                else:
                    head_rule = False
                mixedRelations[i], entities = reRank_relations(
                    entities, mixedRelations[i], questionWord,
                    questionRelationsNumber, question, k, head_rule)
            except:
                # NOTE(review): this fallback repeats the exact same call
                # with the exact same arguments; it can only succeed if
                # the first failure was transient — confirm intent.
                try:
                    mixedRelations[i], entities = reRank_relations(
                        entities, mixedRelations[i], questionWord,
                        questionRelationsNumber, question, k, head_rule)
                except:
                    continue

        # Flatten the per-term candidate lists, keeping the top k overall.
        mixedRelations = mix_list_items(mixedRelations, k)
        entities = mix_list_items_entities(entities, k)

        # NOTE(review): nationalityFlag is initialised False and never set
        # in this variant, so this branch is currently dead.
        if nationalityFlag:
            mixedRelations.append([
                "country", "<https://www.wikidata.org/wiki/Property:P17>", 20,
                "country"
            ])

        # If the evaluation flag is set to True, run the Falcon 2.0 pipeline on datasets
        if evaluation:
            # Relation p/r against raw[2]; ids compared on the URI local
            # name (the trailing '>' is stripped by the [:-1] slice).
            if relations_flag:
                numberSystemRelations = len(raw[2])
                intersection = set(raw[2]).intersection([
                    tup[1][tup[1].rfind('/') + 1:-1] for tup in mixedRelations
                ])
                if numberSystemRelations != 0 and len(mixedRelations) != 0:
                    p_relation = len(intersection) / len(mixedRelations)
                    r_relation = len(intersection) / numberSystemRelations

            # Entity p/r and correct/wrong counts against raw[1].
            true_entity = []
            for e in raw[1]:
                true_entity.append(e)
            numberSystemEntities = len(raw[1])
            intersection = set(true_entity).intersection(
                [tup[1][tup[1].rfind('/') + 1:-1] for tup in entities])

            if numberSystemEntities != 0 and len(entities) != 0:
                p_entity = len(intersection) / len(entities)
                r_entity = len(intersection) / numberSystemEntities
            for e in true_entity:
                if e in [tup[1][tup[1].rfind('/') + 1:-1] for tup in entities]:
                    correctEntities = correctEntities + 1
                else:
                    wrongEntities = wrongEntities + 1

        count = count + 1
        ############
        # Append predictions ([uri, surface_form]) and the p/r scores.
        raw.append([[tup[1], tup[4]] for tup in mixedRelations])
        raw.append([[tup[1], tup[4]] for tup in entities])
        raw.append(p_entity)
        raw.append(r_entity)
        raw.append(p_relation)
        raw.append(r_relation)
        # When running multi-threaded, also collect into the shared
        # module-level results list.
        global threading
        if threading == True:
            global results
            results.append(raw)

        return raw
    except:
        # NOTE(review): bare except hides every error (only "error" is
        # printed and None is returned); narrow this when debugging.
        #raise
        print("error")
예제 #6
0
 combinations=merge_comb_stop_words(combinations,question,questionStopWords)
 combinations=sort_combinations(combinations,question)
 combinations=merge_entity_prefix(question,combinations,originalQuestion)
 combinations,compare_found=split_bas_on_comparison(combinations)
 combinations=extract_abbreviation(combinations)
 i=0
 nationalityFlag=False
 for term in combinations:
     properties=[]
     entities_term=[]
     if len(term)==0:
         continue
     
     if (not word_is_verb(term,originalQuestion)) and (term[0].istitle() or len(term.split(' ')) > 2   or (any(x.isupper() for x in term))) :
         # print(term," ", i)
         entityResults=wiki_search_elastic.entitySearch(term)
         if " and " in term:
             for word in term.split(' and '):
                 entityResults.extend(wiki_search_elastic.entitySearch(word.strip()))
         if " or " in term:
             for word in term.split(' or '):
                 entityResults.extend(wiki_search_elastic.entitySearch(word.strip()))
         if len(entityResults)!=0:
             for result in entityResults:
                 if result[1] not in [e[1] for e in entities_term]:
                     entities_term.append(result+[term])
             #print(len(entities_term))
             entities.append(entities_term)
                 #print(entities)
     else:
         propertyResults=wiki_search_elastic.propertySearch(term)