Ejemplo n.º 1
0
def _get_version() -> str:
    """Derive the version string from the downloaded ITIS zip archive.

    The name of the first archive member ending in ``.sqlite`` is taken, and
    ``len('itisSqlite')`` characters are sliced off its front and
    ``len('/ITIS.sqlite')`` characters off its back; what remains is returned.

    :returns: the remaining middle part of that member's file name
    :raises ValueError: if no archive member ends with ``.sqlite``
    """
    archive_path = ensure_path(PREFIX, url=URL)
    leading = len('itisSqlite')
    trailing = len('/ITIS.sqlite')
    with zipfile.ZipFile(archive_path) as archive:
        sqlite_names = (
            info.filename
            for info in archive.filelist
            if info.filename.endswith('.sqlite')
        )
        # Only the first matching member is ever consulted.
        first_match = next(sqlite_names, None)
    if first_match is None:
        raise ValueError('could not find a file with the version in it')
    return first_match[leading:-trailing]
Ejemplo n.º 2
0
def get_path(version: str):
    """Return the path to the extracted ChEMBL SQLite database, unpacking it on first use.

    The release tarball is obtained through ``ensure_path``. It is extracted
    into the versioned prefix directory only when the database file is not
    already present there.

    :param version: ChEMBL release, interpolated into the download URL and
        into the archive-internal path of the database file
    :returns: the joined path of the prefix directory and
        ``chembl_{version}/chembl_{version}_sqlite/chembl_{version}.db``
    """
    archive_url = f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{version}/chembl_{version}_sqlite.tar.gz'
    archive_path = ensure_path(PREFIX, url=archive_url, version=version)
    relative_db_path = f'chembl_{version}/chembl_{version}_sqlite/chembl_{version}.db'
    directory = get_prefix_directory(PREFIX, version=version)
    database_path = os.path.join(directory, relative_db_path)

    # Nothing to unpack if an earlier call already extracted the database.
    if os.path.exists(database_path):
        return database_path

    with tarfile.open(archive_path, mode='r', encoding='utf-8') as archive:
        archive.extractall(directory)
    return database_path
Ejemplo n.º 3
0
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms.

    Ensures the ITIS zip archive is available, unpacks ``ITIS.sqlite`` into the
    versioned prefix directory when it is not there yet, loads the results of
    ``LONGNAMES_QUERY`` and ``HIERARCHY_QUERY`` from the database, and yields
    one term per row of the former.

    :yields: a term for each identifier from ``LONGNAMES_QUERY``, whose parents
        are the references of its ``HIERARCHY_QUERY`` parents (except ``'0'``)
    :raises FileNotFoundError: if the SQLite file is still missing after the
        extraction step
    :raises KeyError: if a parent identifier other than ``'0'`` has no entry
        among the ``LONGNAMES_QUERY`` results
    """
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = get_prefix_directory(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, 'ITIS.sqlite', version=version)
    if not os.path.exists(sqlite_path):
        extracted_dir = os.path.join(sqlite_dir, f'itisSqlite{version}')
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    # Flatten the layout: move the database out of the
                    # archive's sub-directory, then drop that directory.
                    shutil.move(os.path.join(extracted_dir, 'ITIS.sqlite'), sqlite_path)
                    os.rmdir(extracted_dir)
                    # The database is in place; processing another member
                    # would retry the move on a file that is already gone.
                    break

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    # Both result sets are fully fetched inside this block, so the connection
    # is closed before the first term is yielded. It therefore cannot leak
    # even if the caller abandons the generator early.
    with closing(sqlite3.connect(sqlite_path)) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(LONGNAMES_QUERY)
            id_to_reference = {
                str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
                for identifier, name in cursor.fetchall()
            }

        with closing(conn.cursor()) as cursor:
            cursor.execute(HIERARCHY_QUERY)
            id_to_parents = multidict(
                (str(child), str(parent))
                for child, parent in cursor.fetchall()
            )

    for identifier, reference in id_to_reference.items():
        # A parent of '0' is skipped rather than looked up; per the original
        # author's note, "this means it's a plant".
        parents = [
            id_to_reference[parent_identifier]
            for parent_identifier in id_to_parents.get(identifier, [])
            if parent_identifier != '0'
        ]
        yield Term(
            reference=reference,
            parents=parents,
        )
Ejemplo n.º 4
0
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms.

    Ensures the ITIS zip archive is available, extracts it into the prefix
    directory when ``itisSqlite043020/ITIS.sqlite`` is not there yet, loads the
    results of ``LONGNAMES_QUERY`` and ``HIERARCHY_QUERY`` from the database,
    and yields one term per row of the former.

    :yields: a term for each identifier from ``LONGNAMES_QUERY``, whose parents
        are the references of its ``HIERARCHY_QUERY`` parents (except ``'0'``)
    :raises FileNotFoundError: if the SQLite file is still missing after the
        archive has been extracted
    :raises KeyError: if a parent identifier other than ``'0'`` has no entry
        among the ``LONGNAMES_QUERY`` results
    """
    zip_path = ensure_path(PREFIX, URL)
    # NOTE: the release directory name is hard-coded, so an archive from any
    # other release will end in the FileNotFoundError below.
    sqlite_path = prefix_directory_join(PREFIX, 'itisSqlite043020',
                                        'ITIS.sqlite')
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(get_prefix_directory(PREFIX))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    # Both result sets are fully fetched inside this block, so the connection
    # is closed before the first term is yielded. It therefore cannot leak
    # even if the caller abandons the generator early.
    with closing(sqlite3.connect(sqlite_path)) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(LONGNAMES_QUERY)
            id_to_reference = {
                str(identifier): Reference(prefix=PREFIX,
                                           identifier=str(identifier),
                                           name=name)
                for identifier, name in cursor.fetchall()
            }

        with closing(conn.cursor()) as cursor:
            cursor.execute(HIERARCHY_QUERY)
            id_to_parents = multidict(
                (str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        # A parent of '0' is skipped rather than looked up; per the original
        # author's note, "this means its a plant".
        parents = [
            id_to_reference[parent_identifier]
            for parent_identifier in id_to_parents.get(identifier, [])
            if parent_identifier != '0'
        ]
        yield Term(
            reference=reference,
            parents=parents,
        )
Ejemplo n.º 5
0
def iter_terms(version: str, autodownload: bool = False) -> Iterable[Term]:
    """Iterate over UMLS terms.

    Reads ``MRCONSO.RRF`` from the UMLS zip archive, groups consecutive rows by
    their first ``|``-separated field (used as the CUI), and yields one term
    for every group that has exactly one preferred English row.

    :param version: UMLS release, interpolated into the archive name and URL
    :param autodownload: if true, obtain the archive through ``ensure_path``;
        otherwise expect it at the location given by ``RAW_MODULE.get``
    :yields: a term per CUI, named after the ``STR`` value of its preferred
        row, with one synonym per English row and sorted, de-duplicated xrefs
    :raises FileNotFoundError: if ``autodownload`` is false and the archive is
        not at the expected location
    :raises KeyError: if a term type abbreviation is missing from
        ``synonym_abb``
    """
    name = f'umls-{version}-mrconso.zip'
    url = f'https://download.nlm.nih.gov/umls/kss/{version}/{name}'
    if autodownload:
        # FIXME needs automated scrapy step where you put in user/password
        path = ensure_path(PREFIX, url=url, version=version)
    else:
        path = RAW_MODULE.get(PREFIX, version, name)
        if not path.exists():
            raise FileNotFoundError(
                f'UMLS needs to be downloaded manually still and moved to  {path}. '
                f'See https://www.nlm.nih.gov/research/umls/index.html')

    with zipfile.ZipFile(path) as zip_file:
        with zip_file.open('MRCONSO.RRF', mode='r') as file:
            it = tqdm(file, unit_scale=True, desc='[umls] parsing')
            lines = (line.decode('utf-8').strip().split('|') for line in it)
            # groupby only merges adjacent rows; this assumes the file keeps
            # all rows of one CUI together -- TODO confirm.
            for cui, cui_lines in itt.groupby(lines,
                                              key=operator.itemgetter(0)):
                df = pd.DataFrame(list(cui_lines), columns=RRF_COLUMNS)
                # Take an explicit copy of the English rows: the column
                # assignment further down would otherwise write to a frame
                # derived from a boolean filter, which pandas reports as
                # chained assignment (SettingWithCopyWarning).
                df = df[df['LAT - Language'] == 'ENG'].copy()
                # Plain boolean row mask (the original wrapped it in a
                # 1-tuple through a stray trailing comma).
                idx = ((df['ISPREF - is preferred'] == 'Y')
                       & (df['TS - Term Status'] == 'P')
                       & (df['STT - String Type'] == 'PF'))
                pref_rows_df = df.loc[idx]
                if len(pref_rows_df.index) != 1:
                    # Covers both "none" and "more than one" preferred rows.
                    it.write(
                        f'no preferred term for umls:{cui}. got {len(pref_rows_df.index)}'
                    )
                    continue

                # Expand term type abbreviations; an unknown abbreviation
                # raises KeyError rather than being passed through.
                df['TTY - Term Type in Source'] = df[
                    'TTY - Term Type in Source'].map(synonym_abb.__getitem__)

                _r = pref_rows_df.iloc[0]
                sdf = df[[
                    'SAB - source name', 'CODE', 'TTY - Term Type in Source',
                    'STR'
                ]]

                synonyms = []
                xrefs = []
                for source, identifier, synonym_type, synonym in sdf.values:
                    norm_source = normalize_prefix(source)
                    if norm_source is None or not identifier:
                        # Unrecognized source or empty code: keep the synonym
                        # but record no provenance and no xref.
                        provenance = []
                    else:
                        ref = Reference(prefix=norm_source,
                                        identifier=identifier)
                        provenance = [ref]
                        xrefs.append(ref)
                    synonyms.append(
                        Synonym(
                            name=synonym,
                            provenance=provenance,
                            type=SynonymTypeDef.from_text(synonym_type),
                        ))

                # De-duplicate and order xrefs deterministically.
                xrefs = sorted(set(xrefs),
                               key=lambda reference:
                               (reference.prefix, reference.identifier))

                term = Term(
                    reference=Reference(prefix=PREFIX,
                                        identifier=cui,
                                        name=_r['STR']),
                    synonyms=synonyms,
                    xrefs=xrefs,
                )
                yield term