Example no. 1
def upload_artifacts_for_prefix(*, prefix: str, bucket: str):
    """Upload compiled parts for the given prefix to AWS.

    :param prefix: The resource prefix whose cached artifacts are uploaded.
    :param bucket: The name of the target S3 bucket.
    """
    # The original body repeated the same get/locate/upload stanza five
    # times; drive it from a table instead. Each getter populates the
    # local cache file, which then lives at <prefix>/cache/<file_name>
    # both locally and in the bucket.
    artifacts = [
        ('id->name mapping', get_id_name_mapping, 'names.tsv'),
        ('id->synonyms mapping', get_id_synonyms_mapping, 'synonyms.tsv'),
        ('xrefs', get_xrefs_df, 'xrefs.tsv'),
        ('relations', get_relations_df, 'relations.tsv'),
        ('properties', get_properties_df, 'properties.tsv'),
    ]
    for label, getter, file_name in artifacts:
        logger.info('[%s] getting %s', prefix, label)
        getter(prefix)  # side effect: materializes the cache file
        path = prefix_directory_join(prefix, 'cache', file_name)
        key = os.path.join(prefix, 'cache', file_name)
        logger.info('[%s] uploading %s', prefix, label)
        upload_file(path=path, bucket=bucket, key=key)
Example no. 2
def iter_terms() -> Iterable[Term]:
    """Iterate over terms from the ITIS taxonomy database."""
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = get_prefix_directory(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, 'ITIS.sqlite', version=version)
    if not os.path.exists(sqlite_path):
        # Pull the bundled .sqlite file out of the archive, then flatten it
        # out of its versioned subdirectory so it sits at sqlite_path.
        with zipfile.ZipFile(zip_path) as zip_file:
            for member in zip_file.filelist:
                if not member.filename.endswith('.sqlite'):
                    continue
                zip_file.extract(member, sqlite_dir)
                extracted_dir = os.path.join(sqlite_dir, f'itisSqlite{version}')
                shutil.move(os.path.join(extracted_dir, 'ITIS.sqlite'), sqlite_path)
                os.rmdir(extracted_dir)

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    # First pass: build identifier -> Reference for every long name.
    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {}
        for identifier, name in cursor.fetchall():
            key = str(identifier)
            id_to_reference[key] = Reference(prefix=PREFIX, identifier=key, name=name)

    # Second pass: child identifier -> list of parent identifiers.
    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent))
            for child, parent in cursor.fetchall()
        )

    for identifier, reference in id_to_reference.items():
        parents = [
            id_to_reference[parent_identifier]
            for parent_identifier in id_to_parents.get(identifier, [])
            if parent_identifier != '0'  # '0' means it's a plant (no real parent)
        ]
        yield Term(reference=reference, parents=parents)
Example no. 3
def iter_terms() -> Iterable[Term]:
    """Iterate over terms from the ITIS taxonomy database."""
    zip_path = ensure_path(PREFIX, URL)
    sqlite_path = prefix_directory_join(PREFIX, 'itisSqlite043020',
                                        'ITIS.sqlite')
    # Extract the whole archive on the first run; subsequent runs find the
    # database already in place and skip extraction.
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(get_prefix_directory(PREFIX))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    # First pass: identifier -> Reference for every long name.
    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {}
        for identifier, name in cursor.fetchall():
            key = str(identifier)
            id_to_reference[key] = Reference(prefix=PREFIX,
                                             identifier=key,
                                             name=name)

    # Second pass: child identifier -> list of parent identifiers.
    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent)) for child, parent in cursor.fetchall())

    for identifier, reference in id_to_reference.items():
        parents = [
            id_to_reference[parent_identifier]
            for parent_identifier in id_to_parents.get(identifier, [])
            if parent_identifier != '0'  # '0' means it's a plant (no real parent)
        ]
        yield Term(reference=reference, parents=parents)
Example no. 4
def _lookup(name):
    """Resolve an organism name to its NCBI taxonomy identifier."""
    # EAFP: prefer the manual remapping table, falling back to the
    # generic ncbitaxon name->id mapping on a miss.
    try:
        return taxonomy_remapping[name]
    except KeyError:
        return get_name_id_mapping('ncbitaxon')[name]


def get_df() -> pd.DataFrame:
    """Get the BioGRID identifiers mapping dataframe."""
    # skiprows=28 skips the file's preamble; everything is read as strings.
    rv = ensure_df(PREFIX, URL, skiprows=28, dtype=str)
    # Annotate each row with the taxonomy id resolved from the organism name.
    rv['taxonomy_id'] = rv['ORGANISM_OFFICIAL_NAME'].map(_lookup)
    return rv


@cached_mapping(
    path=prefix_directory_join(PREFIX, 'cache', 'xrefs', 'ncbigene.tsv'),
    header=['biogrid_id', 'ncbigene_id'],
)
def get_ncbigene_mapping() -> Mapping[str, str]:
    """Get BioGRID to NCBIGENE mapping.

    Is basically equivalent to:

    .. code-block:: python

        from pyobo import get_filtered_xrefs
        biogrid_ncbigene_mapping = get_filtered_xrefs('biogrid', 'ncbigene')
    """
    df = get_df()
    # Keep only the Entrez gene cross-references.
    df = df.loc[df['IDENTIFIER_TYPE'] == 'ENTREZ_GENE',
                ['BIOGRID_ID', 'IDENTIFIER_VALUE']]
    # Fix: the function was declared to return a Mapping[str, str] but fell
    # off the end without returning; build the biogrid->ncbigene dict from
    # the two remaining columns (same pattern as get_complexportal_mapping).
    return dict(df.values)
Example no. 5
    return get_name_id_mapping('ncbitaxon')[name]


def get_df() -> pd.DataFrame:
    """Get the BioGRID identifiers mapping dataframe."""
    version = bioversions.get_version('biogrid')
    # The download URL embeds the release version twice.
    url = f'{BASE_URL}/BIOGRID-{version}/BIOGRID-IDENTIFIERS-{version}.tab.zip'
    # skiprows=28 skips the file's preamble; everything is read as strings.
    rv = ensure_df(PREFIX, url=url, skiprows=28, dtype=str, version=version)
    # Annotate each row with the taxonomy id resolved from the organism name.
    rv['taxonomy_id'] = rv['ORGANISM_OFFICIAL_NAME'].map(_lookup)
    return rv


@cached_mapping(
    path=prefix_directory_join(PREFIX,
                               'cache',
                               'xrefs',
                               'ncbigene.tsv',
                               version=version_getter(PREFIX)),
    header=['biogrid_id', 'ncbigene_id'],
)
def get_ncbigene_mapping() -> Mapping[str, str]:
    """Get BioGRID to NCBIGENE mapping.

    Is basically equivalent to:

    .. code-block:: python

        from pyobo import get_filtered_xrefs
        biogrid_ncbigene_mapping = get_filtered_xrefs('biogrid', 'ncbigene')
    """
    df = get_df()
    # Fix: the function was declared to return a Mapping[str, str] but
    # stopped after loading the dataframe. Mirror the non-versioned
    # variant: keep only Entrez gene rows and build the mapping from the
    # BioGRID id column to the identifier value column.
    df = df.loc[df['IDENTIFIER_TYPE'] == 'ENTREZ_GENE',
                ['BIOGRID_ID', 'IDENTIFIER_VALUE']]
    return dict(df.values)
Example no. 6
def _get_complexportal_df():
    """Load the Complex Portal mappings file as a two-column dataframe."""
    # Headerless TSV; name the columns explicitly.
    columns = ['source_id', 'target_id']
    return pd.read_csv(COMPLEXPORTAL_MAPPINGS, sep='\t', header=None, names=columns)


def get_intact_complex_portal_xrefs_df() -> pd.DataFrame:
    """Get IntAct-Complex Portal xrefs."""
    xrefs = _get_complexportal_df()
    # Tag the namespaces and record where the mapping came from.
    xrefs['source_ns'] = 'intact'
    xrefs['target_ns'] = 'complexportal'
    xrefs['source'] = COMPLEXPORTAL_MAPPINGS
    # Reorder into the canonical xref column layout.
    return xrefs[['source_ns', 'source_id', 'target_ns', 'target_id', 'source']]


@cached_mapping(
    path=prefix_directory_join('intact', 'cache', 'xrefs', 'complexportal.tsv'),
    header=['intact_id', 'complexportal_id'],
)
def get_complexportal_mapping() -> Mapping[str, str]:
    """Get IntAct to Complex Portal mapping.

    Is basically equivalent to:

    .. code-block:: python

        from pyobo import get_filtered_xrefs
        intact_complexportal_mapping = get_filtered_xrefs('intact', 'complexportal')
    """
    # Two columns (source_id, target_id) become key/value pairs.
    pairs = _get_complexportal_df().values
    return dict(pairs)