Code example #1
def calculate_relevance_scores(query: dict, json_contents: dict, ef: EntityInfo):
    scored_documents = {}

    # Skip the first reply: it is the thread's original post, i.e. the query.
    for index, reply in enumerate(json_contents['replies']):
        if index == 0:
            continue

        try:
            is_same_category = query['category'] == json_contents['commonCategory']
            is_same_thread = query['threadId'] == json_contents['threadId']
            number_votes = reply['postThankYouCount'] + reply['postHelpfulCount'] + reply['postSupportCount']
            text_length = len(reply['postText'])
            is_doctor_reply = reply['mdReply']

            number_medical_entities = 0
            number_same_entities = 0

            if 'annotationsFull' in reply:
                annotations = set(get_entity_code(entity) for entity in reply['annotationsFull'])

                for entity in annotations:
                    # Classify each entity as informative or not, caching new
                    # results in EntityInfo to avoid repeated lookups.
                    if entity in ef.informative_entities:
                        number_medical_entities += 1
                        if entity in query['annotations']:
                            number_same_entities += 1
                    elif entity not in ef.other_entities and ef.is_informative_entity(entity):
                        number_medical_entities += 1
                        ef.update_informative_list(entity)

                        if entity in query['annotations']:
                            number_same_entities += 1

                    elif entity not in ef.other_entities:
                        ef.update_other_list(entity)

            document_score = scoring_function(is_doctor_reply, number_votes, number_medical_entities,
                                              number_same_entities, text_length, is_same_category, is_same_thread)

            document_name = "EF" + str(json_contents['threadId']) + "r" + str(index)

            scored_documents[document_name] = document_score

        except KeyError as ke:
            # threadId is numeric elsewhere, so cast it before concatenating.
            if 'threadId' in json_contents:
                print("KeyError " + str(ke) + " in file: " + str(json_contents['threadId']))
            else:
                print("KeyError in unknown file.")

            return None

    return scored_documents
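A minimal usage sketch for this example; the file name is a placeholder and the query dict is hand-built with the keys the function reads ('category', 'threadId', 'annotations'), so all values are made up.

import json

ef = EntityInfo()
with open("thread.json", "r", encoding="utf8") as f:
    contents = json.loads(f.read())

query = {'category': contents['commonCategory'],
         'threadId': contents['threadId'],
         'annotations': ['C0011849']}  # hypothetical entity code

scores = calculate_relevance_scores(query, contents, ef)
print(scores)  # e.g. {'EF4711r1': 3.7, 'EF4711r2': 1.2}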
Code example #2
def extract_query(json_contents: dict, ef: EntityInfo):
    # A thread needs at least two posts: the original (query) post and one reply.
    if json_contents['replyCount'] < 2:
        return None

    query = {'category': json_contents['commonCategory']}

    # The first reply is the thread's original post; keep only its informative entities.
    annotations = []
    if 'annotationsFull' in json_contents['replies'][0]:
        for annotation in json_contents['replies'][0]['annotationsFull']:
            entity = get_entity_code(annotation)

            if ef.is_informative_entity(entity):
                annotations.append(entity)

        query['annotations'] = annotations
    else:
        query['annotations'] = None

    post_text = json_contents['replies'][0]['postText']
    query['length'] = len(post_text)
    query['text'] = post_text.replace('\t', '')
    query['username'] = json_contents['replies'][0]['createdBy']['username']

    query['threadId'] = json_contents['threadId']
    if 'annotatedOriginCategory' in json_contents:
        query['annotatedOriginCategory'] = json_contents['annotatedOriginCategory']

    return query
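As a hedged follow-up to example #1, this is how the extracted query would feed the scorer; contents is a parsed thread JSON dict as in the previous sketch.

query = extract_query(contents, ef)
if query is not None:  # threads with fewer than two posts yield no query
    scores = calculate_relevance_scores(query, contents, ef)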
Code example #3
def make_annotation_types(annotations: list, ef: EntityInfo) -> list:
    """
    Extract information the type of annotation for a given entity code (e.g. C124894)

    @param annotations: List of annotations to get information about.
    @param ef: EntityInfo class, used to extract information about informative entities.
    @return: Number of each entity type (uses the list of informative entity types defined in
    relevanceRanking.connect_to_kb)
    """
    # Start every known informative type at zero.
    types_counts = {entity: 0 for entity in informative_entity_types}

    if annotations is not None:
        for annotation in annotations:
            try:
                types = ef.get_entity_types(annotation)

            except ValueError as ve:
                print("Value could not be found: " + str(annotation))
                continue

            for entity_type in types:
                types_counts[entity_type] += 1

    output_counts = []
    for entity in informative_entity_types:
        output_counts.append(types_counts[entity])

    return output_counts
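A short illustrative call, assuming ef is an EntityInfo instance and informative_entity_types is the same module-level list the function uses; the entity codes are placeholders.

counts = make_annotation_types(['C0011849', 'C0027051'], ef)
print(dict(zip(informative_entity_types, counts)))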
Code example #4
def produce_training_data(scores: dict, queries: dict, documents: dict,
                          ef: EntityInfo) -> (list, list):
    training_data = []
    target_values = []

    for query_id in scores:
        for document_id in scores[query_id]:

            # Skip pairs whose full query or document details were not loaded.
            if document_id in documents and query_id in queries:
                document = documents[document_id]
                query = queries[query_id]
            else:
                continue

            is_same_category = int(query['category'] == document['category'])
            is_same_thread = int(query['threadId'] == document['threadId'])
            number_votes_t = document['votes-t']
            number_votes_s = document['votes-s']
            number_votes_h = document['votes-h']
            is_doctor_reply = int(document['mdReply'])

            # Bucket the post length into 150-character bins.
            text_length_factor = document['postLength'] // 150

            number_medical_entities = 0
            number_same_entities = 0

            annotations = set(document['annotations'])

            for entity in annotations:
                try:
                    if ef.is_informative_entity(entity):
                        number_medical_entities += 1

                        # Queries extracted without annotations carry None here.
                        if query['annotations'] and entity in query['annotations']:
                            number_same_entities += 1

                except ValueError as ve:
                    print("The entity " + entity +
                          " could not be evaluated. Skipping...")
                    continue

            training_item = [
                is_doctor_reply, number_votes_h, number_votes_s,
                number_votes_t, number_medical_entities, number_same_entities,
                text_length_factor, is_same_category, is_same_thread
            ]
            target_item = [scores[query_id][document_id]]

            training_data.append(training_item)
            target_values.append(target_item)

    return training_data, target_values
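A hedged sketch of the input shapes this function expects; every ID and value below is made up for illustration.

scores = {'EF4711q': {'EF4711r1': 2.5}}
queries = {'EF4711q': {'category': 'diabetes', 'threadId': 4711,
                       'annotations': ['C0011849']}}
documents = {'EF4711r1': {'category': 'diabetes', 'threadId': 4711,
                          'votes-t': 3, 'votes-s': 1, 'votes-h': 2,
                          'mdReply': True, 'postLength': 450,
                          'annotations': ['C0011849']}}

training_data, target_values = produce_training_data(scores, queries, documents, ef)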
Code example #5
def main():
    ef = EntityInfo()

    # starting_file and output_directory are module-level configuration values.
    bm25_scores = read_score_file(starting_file)

    full_queries = make_queries(list(bm25_scores.keys()), ef)

    # Collect the ID of every document that received a BM25 score.
    document_ids = set()
    for query in bm25_scores:
        for doc_id in bm25_scores[query].keys():
            document_ids.add(doc_id)

    full_documents = find_documents(document_ids)
    training_data, target_values = produce_training_data(
        bm25_scores, full_queries, full_documents, ef)
    write_out_training_data(output_directory, training_data, target_values)
Code example #6
def main():
    entity_info = EntityInfo()

    queries = make_queries(starting_directory, entity_info)
    with open(os.path.join(output_directory, "queries.json"), "w+", encoding="utf8") as output_file:
        json_queries = []
        for query in queries:
            # Note: json_query aliases query, so converting the annotation set
            # to a JSON-friendly list here also mutates the entries in queries
            # before the later loop_all_documents call.
            json_query = query
            json_query['annotations'] = list(query['annotations'])

            json_queries.append(json_query)

        json.dump(json_queries, output_file)

    print("Wrote queries to file.")

    loop_all_documents(queries, starting_directory, entity_info)
Code example #7
for result in contents['hits']['hits']:
    titles.append(result['_source']['Title'])

print(len(titles))

# print(json.dumps(titles))

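# Assumed shape of mayo_map.txt, inferred from the parsing below (illustrative):
#   Diabetes;type 2 diabetes
#   Heart Disease;coronary artery disease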
mayo_namemap = {}
with open("d:/onedrive/documents/ab germany/health_data/mayo_map.txt", "r", encoding="utf8", errors='ignore') as file:
    for line in file:
        category, mayo_name = line.split(";")
        category = category.lower()
        mayo_name = mayo_name.replace('\n', '').lower()
        mayo_namemap[category] = mayo_name

ef = EntityInfo()

# print(contents['hits']['hits'][1]['_source']['aida']['annotatedText'])

entities_by_category = {}
print("Number of titles: " + str(len(mayo_namemap)))

counter = 0

print('Values:')
print(list(mayo_namemap.values()))


for result in contents['hits']['hits']:
    print(result['_source']['Title'])
    # != -1 also matches titles that start with 'Abdominal' (find returns 0 there).
    if result['_source']['Title'].find('Abdominal') != -1:
Code example #8
File: trec_maker_noent.py  Project: zafodB/HealthData
                      encoding="utf8")

if WRITE_DOCUMENTS:
    data_file = open(os.path.join(output_directory_data, "data0.trac"),
                     "w+",
                     encoding="utf8")

processed_files = 0
error_files = 0
documents_written = 0
queries_written = 0

# Bare entity codes (e.g. C124894) and full [[Cnnn|surface text]] annotations.
pattern = re.compile(r'C[0-9]{3,}')
pattern_long = re.compile(r"\[\[C[0-9]+\|[\w\s]+\]\]")

ef = EntityInfo()

# Walk through the input JSON data directory.
for root, dirs, files in os.walk(starting_directory):
    for file_name in files:
        try:
            with open(os.path.join(root, file_name), "r",
                      encoding="utf8") as file:
                content = json.loads(file.read())

            # Write out progress every bunch of files.
            processed_files += 1
            if processed_files % 100 == 0:
                print("Processed files: " + str(processed_files))

            if WRITE_QUERIES:
Code example #9
'''
 * Created by filip on 08/11/2019
'''

from prepareTrainingData.EntityInfo.entities_info import EntityInfo
from prepareTrainingData.EntityInfo.connect_to_kb import get_entity_types

informative_entities = set()

with open("/home/fadamik/Documents/informative_nodes.txt",
          "r",
          encoding="utf8") as file:
    for line in file:
        informative_entities.add(line.replace("\n", ""))

ei = EntityInfo()

# Look up each entity's types in the knowledge base and cache them in EntityInfo.
for entity in informative_entities:
    types = get_entity_types(entity, ei.get_es())
    ei.update_entity_types(entity, types)

ei.write_out_entity_types()
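As far as the calls suggest, this script pre-computes a cache: update_entity_types stores each knowledge-base lookup in the EntityInfo instance and write_out_entity_types persists the result, presumably so later stages can read entity types from disk instead of querying the knowledge base entity by entity.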
Code example #10

processed_files = 0
error_files = 0
documents_written = 0
queries_written = 0

data_file = open(os.path.join(output_directory_data, "data1.trac"), "w+", encoding="utf8")
maps_file_2 = open(os.path.join(output_directory_maps, "topic-map2.txt"), "w+", encoding="utf8")
maps_file_3 = open(os.path.join(output_directory_maps, "topic-map3.txt"), "w+", encoding="utf8")
query_file = open(os.path.join(output_directory_queries, "queries0.txt"), "w+", encoding="utf8")

pattern = re.compile(r'C[0-9]{3,}')
pattern_long = re.compile(r"\[\[C[0-9]+\|[\w\s]+\]\]")

ef = EntityInfo()

for root, dirs, files in os.walk(starting_directory):
    for file_name in files:
        try:
            with open(os.path.join(root, file_name), "r", encoding="utf8") as file:
                content = json.loads(file.read())

            # The first reply is the thread's original post, used as the query.
            query = content['replies'][0]
            topic_no = str(content['threadId'])

            entities_string = ""

            if 'annotationsFull' in query:
                for annotation in query['annotationsFull']:
Code example #11
def produce_training_data(selected: list, queries: dict, documents: dict,
                          ef: EntityInfo) -> list:
    """
    Create training data by reading the full query, extracting document features with respect to the
    query, and pairing each query-document combination into a training item.

    @param selected: List of selected queries
    @param queries: Full details of the selected queries
    @param documents: Relevant details of the selected documents.
    @param ef: EntityInfo class, used to extract information about informative entities.
    @return: List of individual training pairs with all details necessary for Snorkel processing.
    """
    training_data = []

    entity_list = ef.get_entity_relations()

    for query_id in selected:
        if query_id in queries:
            query = queries[query_id]
        else:
            continue

        if 'annotatedOriginCategory' in query:
            # Fold the annotated category into the query's entity list.
            category_annotation = get_entity_code(query['annotatedOriginCategory'])
            if category_annotation is not None:
                if query['annotations'] is not None:
                    query['annotations'].append(category_annotation)
                else:
                    query['annotations'] = [category_annotation]

        for document_id in selected[query_id]:
            # continue (not break) so one missing document does not drop the
            # rest of the query's documents.
            if document_id in documents:
                document = documents[document_id]
            else:
                continue

            query_category = query['category']
            query_thread = query['threadId']
            query_text = query['text'].replace('\t', '')
            query_username = query['username']
            query_annotation_count = ""
            for index, count in enumerate(
                    make_annotation_types(query['annotations'], ef)):
                if index > 0:
                    query_annotation_count = query_annotation_count + '\t'

                query_annotation_count = query_annotation_count + str(count)

            document_annotations = ';'.join(document['annotations'])

            if query['annotations'] is not None:
                query_annotations = ';'.join(query['annotations'])

                # Collect knowledge-base relations that link a query entity to
                # a document entity.
                relationships = ''
                for annotation in query['annotations']:
                    if annotation in entity_list:
                        for document_annotation in document['annotations']:
                            if document_annotation in entity_list[annotation]:
                                if relationships != '':
                                    relationships += ','
                                relationships += entity_list[annotation][document_annotation]

            else:
                query_annotations = None
                relationships = None

            document_category = document['category']
            document_thread = document['threadId']

            document_user_status = document['userStatus']
            document_username = document['username']
            document_number_votes_t = document['votes-t']
            document_number_votes_s = document['votes-s']
            document_number_votes_h = document['votes-h']
            document_text = document['document-text'].replace('\t', '').replace('\n', '')
            document_is_doctor_reply = document['mdReply']

            document_annotation_count = ""
            for index, count in enumerate(
                    make_annotation_types(document['annotations'], ef)):
                if index > 0:
                    document_annotation_count = document_annotation_count + '\t'

                document_annotation_count = document_annotation_count + str(
                    count)

            # Two 0s at the end represent a BM25 score (non-relevant queries don't have it) and BM25 relevance (0 for
            # non-relevant)
            training_item = [
                query_category, query_thread, query_id, query_text,
                query_username, query_annotations, query_annotation_count,
                document_category, document_id, document_thread, document_text,
                document_is_doctor_reply, document_number_votes_h,
                document_number_votes_s, document_number_votes_t,
                document_username, document_user_status, document_annotations,
                document_annotation_count, relationships, '0', '0'
            ]

            training_data.append(training_item)

    return training_data
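A hedged usage sketch for writing the resulting items out as tab-separated lines (the output file name is a placeholder; selected, queries and documents are the inputs described in the docstring).

rows = produce_training_data(selected, queries, documents, ef)
with open("snorkel_training_data.tsv", "w+", encoding="utf8") as out:
    for item in rows:
        out.write('\t'.join(str(field) for field in item) + '\n')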