コード例 #1
0
def fill_missing_values(session: Session, only_kegg: bool, batch_size: int,
                        error_log: str) -> None:
    """
    Fill in missing mass and atom bag values for compounds with an InChI.

    Compounds that have an InChI but lack a mass and/or an atom bag are
    loaded into a data frame and processed in consecutive batches. Each
    successfully processed batch is written back with a bulk update and
    committed on its own.

    Parameters
    ----------
    session : sqlalchemy.orm.session.Session
        An active session used for querying and updating the database.
    only_kegg : bool
        If true, only consider compounds that have an identifier in a
        registry whose namespace is ``kegg`` or ``coco``.
    batch_size : int
        The number of data frame rows handled per batch.
    error_log : str
        The base file path for error output; ``_batch_<start row>`` is
        appended for every batch.

    Notes
    -----
    A ``ValueError`` raised while handling a batch is logged as a warning
    and that batch is not written. An ``OSError`` raised by
    ``get_atom_bag`` is logged as a warning and an empty atom bag is stored
    for the affected compound.

    """
    query = session.query(Compound.id, Compound.mnx_id, Compound.inchi)

    if only_kegg:
        # Restrict to compounds in KEGG or COCO (the additional compounds
        # for component-contribution).
        query = (
            query.join(CompoundIdentifier)
            .join(Registry)
            .filter(Registry.namespace.in_(("kegg", "coco")))
            .group_by(Compound.id)
        )

    has_inchi = Compound.inchi.isnot(None)
    lacks_value = or_(Compound.mass.is_(None), Compound.atom_bag.is_(None))
    query = query.filter(has_inchi, lacks_value)

    logger.debug("calculating mass for compounds with missing values")
    input_df = pd.read_sql_query(query.statement, query.session.bind)
    total = len(input_df)

    with tqdm(total=total, desc="Analyzed") as pbar:
        for start in range(0, total, batch_size):
            batch = input_df.iloc[start:start + batch_size, :]
            try:
                batch = get_molecular_masses(
                    batch, f"{error_log}_batch_{start}"
                )
                updates = []
                for record in batch.itertuples(index=False):
                    try:
                        atoms = get_atom_bag("inchi", record.inchi)
                    except OSError as bag_error:
                        logger.warning(str(bag_error))
                        atoms = {}
                    updates.append({
                        "id": record.id,
                        "mass": record.mass,
                        "atom_bag": atoms,
                    })
                session.bulk_update_mappings(Compound, updates)
                session.commit()
            except ValueError as batch_error:
                # Skip the whole batch; nothing is written for it.
                logger.warning(str(batch_error))

            # Advance by the size of whichever frame is bound here: the one
            # returned by get_molecular_masses on success, otherwise the
            # raw slice.
            pbar.update(len(batch))
コード例 #2
0
def update_headlines(session: Session, user_dict: Path,
                     logger: Logger) -> None:
    """
    Classify unprocessed headlines and store their tokenized forms.

    All ``Headline`` rows whose ``is_used`` value is NULL are fetched. A
    headline is marked as used when its simplified text passes
    ``is_interesting`` and its categories contain ``DOMESTIC_INDEX``; in
    that case the simplified text, its tokens and the price-tagged tokens
    are stored as well. Every other headline is only marked as unused. All
    changes are written with a single bulk update followed by a commit.

    Parameters
    ----------
    session : sqlalchemy.orm.session.Session
        An active session used for querying and updating the database.
    user_dict : pathlib.Path
        Path passed (as a string) to the ``Tokenizer`` constructor.
    logger : logging.Logger
        Logger that receives the start and end messages.

    Notes
    -----
    Returns immediately, without logging or committing, when there is no
    headline left to process.

    """
    pending = list(
        session.query(Headline).filter(Headline.is_used.is_(None)).all()
    )
    if not pending:
        return

    tokenizer = Tokenizer(str(user_dict))
    updates = []

    logger.info('start updating headlines')
    for entry in tqdm(pending):
        simplified = simplify_headline(entry.headline)

        categories = entry.categories
        is_about_di = categories is not None and DOMESTIC_INDEX in categories

        # `is_template` is deliberately not part of this check any more:
        # using it shrank the dataset and made the results worse.
        if is_interesting(simplified) and is_about_di:
            surfaces = [tok.surface for tok in tokenizer.tokenize(simplified)]
            tokens = kansuuzi2number(surfaces)
            tag_tokens = replace_prices_with_tags(tokens)
            update = {
                'article_id': entry.article_id,
                'simple_headline': simplified,
                'tokens': tokens,
                'tag_tokens': tag_tokens,
                'is_used': True,
            }
        else:
            update = {
                'article_id': entry.article_id,
                'is_used': False,
            }
        updates.append(update)

    session.bulk_update_mappings(Headline, updates)
    session.commit()
    logger.info('end updating headlines')