Beispiel #1
0
def extract_from_mrconso(
        mrconso_path, mrsty_path, opts,
        mrconso_header=HEADERS_MRCONSO, mrsty_header=HEADERS_MRSTY):
    """Stream ``(term, cui, semantic_types, preferred)`` tuples from MRCONSO.

    Loads the CUI -> semantic-type map from MRSTY, then iterates the MRCONSO
    rows for the configured language, normalizing each term according to
    ``opts`` and de-duplicating on ``(cui, term)``.

    Args:
        mrconso_path: path to the MRCONSO.RRF file.
        mrsty_path: path to the MRSTY.RRF file.
        opts: options object providing ``language``, ``lowercase`` and
            ``normalize_unicode`` attributes.
        mrconso_header: column names used to parse MRCONSO rows.
        mrsty_header: column names used to parse MRSTY rows.

    Yields:
        ``(concept_text, cui, semantic_types, preferred)`` where ``preferred``
        is 1 when the row's ISPREF flag is ``'Y'``, else 0.
    """
    start = time.time()
    print('loading semantic types...', end=' ')
    sys.stdout.flush()
    sem_types = get_semantic_types(mrsty_path, mrsty_header)
    print('done in {:.2f} s'.format(time.time() - start))

    start = time.time()

    mrconso_iterator = get_mrconso_iterator(
        mrconso_path, mrconso_header, opts.language
    )

    total = countlines(mrconso_path)

    processed = set()
    i = 0

    for content in mrconso_iterator:
        i += 1

        # Periodic progress report.
        if i % 100000 == 0:
            delta = time.time() - start
            status = (
                '{:,} in {:.2f} s ({:.2%}, {:.1e} s / term)'
                ''.format(i, delta, i / total, delta / i if i > 0 else 0)
            )
            print(status)

        concept_text = content['str'].strip()
        cui = content['cui']
        # BUG FIX: ISPREF is the string 'Y' or 'N'; both are truthy, so the
        # original `if content['ispref']` marked every row as preferred.
        preferred = 1 if content['ispref'] == 'Y' else 0

        if opts.lowercase:
            concept_text = concept_text.lower()

        if opts.normalize_unicode:
            concept_text = unidecode(concept_text)

        # Skip (cui, term) pairs that have already been emitted.
        if (cui, concept_text) in processed:
            continue
        processed.add((cui, concept_text))

        yield (concept_text, cui, sem_types[cui], preferred)

    delta = time.time() - start
    # BUG FIX: the original passed `i / total` as the third format argument,
    # so the '{:.1e} s / term' slot displayed the completion fraction instead
    # of the per-term time (the extra 4th argument was silently ignored).
    status = (
        '\nCOMPLETED: {:,} in {:.2f} s ({:.1e} s / term)'
        ''.format(i, delta, delta / i if i > 0 else 0)
    )
    print(status)
Beispiel #2
0
def extract_from_mrconso(
        mrconso_path, mrsty_path, opts,
        mrconso_header=HEADERS_MRCONSO, mrsty_header=HEADERS_MRSTY):
    """Stream ``(term, cui, semantic_types, preferred)`` tuples from MRCONSO.

    First builds the CUI -> semantic-type lookup from MRSTY, then walks the
    MRCONSO rows for ``opts.language``, applying the lowercase/unicode
    normalization options and de-duplicating on ``(cui, term)``.

    Args:
        mrconso_path: path to the MRCONSO.RRF file.
        mrsty_path: path to the MRSTY.RRF file.
        opts: options object providing ``language``, ``lowercase`` and
            ``normalize_unicode`` attributes.
        mrconso_header: column names used to parse MRCONSO rows.
        mrsty_header: column names used to parse MRSTY rows.

    Yields:
        ``(concept_text, cui, semantic_types, preferred)`` where ``preferred``
        is 1 when the row's ISPREF flag is ``'Y'``, else 0.
    """
    start = time.time()
    print('loading semantic types...', end=' ')
    sys.stdout.flush()
    sem_types = get_semantic_types(mrsty_path, mrsty_header)
    print('done in {:.2f} s'.format(time.time() - start))

    start = time.time()

    mrconso_iterator = get_mrconso_iterator(
        mrconso_path, mrconso_header, opts.language
    )

    total = countlines(mrconso_path)

    processed = set()
    i = 0

    for content in mrconso_iterator:
        i += 1

        # Periodic progress report.
        if i % 100000 == 0:
            delta = time.time() - start
            status = (
                '{:,} in {:.2f} s ({:.2%}, {:.1e} s / term)'
                ''.format(i, delta, i / total, delta / i if i > 0 else 0)
            )
            print(status)

        concept_text = content['str'].strip()
        cui = content['cui']
        # BUG FIX: ISPREF is the string 'Y' or 'N'; both are truthy, so the
        # original `if content['ispref']` marked every row as preferred.
        preferred = 1 if content['ispref'] == 'Y' else 0

        if opts.lowercase:
            concept_text = concept_text.lower()

        if opts.normalize_unicode:
            concept_text = unidecode(concept_text)

        # Skip (cui, term) pairs that have already been emitted.
        if (cui, concept_text) in processed:
            continue
        processed.add((cui, concept_text))

        yield (concept_text, cui, sem_types[cui], preferred)

    delta = time.time() - start
    # BUG FIX: the original passed `i / total` as the third format argument,
    # so the '{:.1e} s / term' slot displayed the completion fraction instead
    # of the per-term time (the extra 4th argument was silently ignored).
    status = (
        '\nCOMPLETED: {:,} in {:.2f} s ({:.1e} s / term)'
        ''.format(i, delta, delta / i if i > 0 else 0)
    )
    print(status)
Beispiel #3
0
def extract_from_mrconso(
        mrconso_path, mrsty_path, opts,
        mrconso_header=HEADERS_MRCONSO, mrsty_header=HEADERS_MRSTY):
    """Stream ``(term, code, cui, semantic_types, preferred)`` tuples from
    MRCONSO, handling suppressed/obsolete rows.

    Active rows (SUPPRESS == 'N') are yielded immediately. A suppressed row
    is only accepted right away if its code was already yielded as active;
    otherwise it is put aside. After the main pass, put-aside rows whose code
    turned out to be active are yielded as well, and the accepted/rejected
    terms are logged to 'codes_added.txt' / 'codes_rejected.txt' in the
    working directory.

    Args:
        mrconso_path: path to the MRCONSO.RRF file.
        mrsty_path: path to the MRSTY.RRF file.
        opts: options object providing ``language``, ``lowercase`` and
            ``normalize_unicode`` attributes.
        mrconso_header: column names used to parse MRCONSO rows.
        mrsty_header: column names used to parse MRSTY rows.

    Yields:
        ``(concept_text, code, cui, semantic_types, preferred)`` where
        ``preferred`` is 1 when the row's ISPREF flag is ``'Y'``, else 0.
    """
    start = time.time()
    print('loading semantic types...', end=' ')
    sys.stdout.flush()
    sem_types = get_semantic_types(mrsty_path, mrsty_header)
    print('done in {:.2f} s'.format(time.time() - start))

    start = time.time()

    mrconso_iterator = get_mrconso_iterator(
        mrconso_path, mrconso_header, opts.language
    )

    total = countlines(mrconso_path)

    processed = set()   # (code, concept_text) pairs already yielded
    yielded = set()     # codes yielded at least once as active
    put_aside = set()   # suppressed rows deferred to the second pass
    code_concepts = 0
    i = 0
    added_from_suppressed = 0

    for content in mrconso_iterator:
        i += 1

        # Periodic progress report.
        if i % 100000 == 0:
            delta = time.time() - start
            status = (
                '{:,} in {:.2f} s ({:.2%}, {:.2f} ms / term)'
                ''.format(i, delta, i / total, delta / i * 1000 if i > 0 else 0)
            )
            print(status)

        concept_text = content['str'].strip()
        cui = content['cui']
        code = content['code']
        preferred = 1 if content['ispref'] == 'Y' else 0
        suppressed = content['suppress']

        if opts.lowercase:
            concept_text = concept_text.lower()

        if opts.normalize_unicode:
            concept_text = unidecode(concept_text)

        # Skip (code, term) pairs that have already been emitted.
        if (code, concept_text) in processed:
            continue

        # A suppressed row is acceptable only if its code was already seen
        # as active; otherwise defer it and re-check after the main pass.
        if suppressed != 'N':
            if code not in yielded:
                # tuple(...) makes the entry hashable for the set.
                put_aside.add((concept_text, code, cui,
                               tuple(sem_types[cui]), preferred))
                continue
            added_from_suppressed += 1

        processed.add((code, concept_text))
        code_concepts += 1
        yielded.add(code)  # set.add is idempotent; no membership check needed
        yield (concept_text, code, cui, sem_types[cui], preferred)

    # Second pass: yield put-aside rows whose code proved to be active.
    print("Concepts put aside: " + str(len(put_aside)))
    rejected = 0
    with open('codes_rejected.txt', 'w') as rejected_file, \
            open('codes_added.txt', 'w') as added_file:
        for (concept_text, code, cui, semtypes, preferred) in put_aside:
            if code in yielded and (code, concept_text) not in processed:
                processed.add((code, concept_text))
                code_concepts += 1
                yield (concept_text, code, cui, list(semtypes), preferred)
                added_from_suppressed += 1
                added_file.write(str((concept_text, code, cui, preferred)) +"\n")
                continue
            rejected += 1
            rejected_file.write(str((concept_text, code, cui, preferred)) +"\n")
    print("Terms added from suppressed: " + str(added_from_suppressed))
    print("Terms rejected from put aside: " + str(rejected))
    print("Total number of code_concept combinations added to db: " + str(code_concepts))

    delta = time.time() - start
    # BUG FIX: delta is a float number of seconds; the original '{:,}' slot
    # printed it with full precision. Use '{:.2f}' to match the in-progress
    # status format above.
    status = (
        '\nCOMPLETED: {:,} in {:.2f} s ({:.2f} ms / term)'
        ''.format(i, delta, delta / i * 1000 if i > 0 else 0)
    )
    print(status)