コード例 #1
0
ファイル: get_subfigures.py プロジェクト: yfpeng/PMCFigureX
def get_subfigure_f(db_file, image_dir, subfigures_dst):
    """Insert newly split subfigures into the database and export all to CSV.

    Step 1: every (pmcid, figure_name) pair returned by
    ``sql_select_empty_subfigurejsonfiles`` is passed to ``split_subfigure``
    and the resulting boxes are inserted through ``sql_insert_subfigure``.
    Step 2: every row returned by ``sql_select_subfigure`` is written to
    ``subfigures_dst`` as CSV, one row per subfigure.

    Args:
        db_file: Database path handed to ``sqlite3.connect``.
        image_dir: Root directory of the figure files. It is combined with
            ``/``, so presumably a ``pathlib.Path`` -- confirm with callers.
        subfigures_dst: Destination passed to ``DataFrame.to_csv``.
    """
    conn = sqlite3.connect(db_file)
    # try/finally guarantees the connection is released even when splitting
    # or a query raises half-way through.
    try:
        pending = select_helper(conn, sql_select_empty_subfigurejsonfiles,
                                ['pmcid', 'figure_name'])

        # One timestamp shared by every record inserted during this run.
        insert_time = f'{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}'
        insert_subfigure_helper = DBHelper(conn, sql_insert_subfigure)
        insert_subfigure_helper.start()
        for pmcid, figure_name in tqdm.tqdm(zip(pending['pmcid'],
                                                pending['figure_name']),
                                            total=len(pending),
                                            desc='Query new subfigures'):
            figure_file = (image_dir / generate_path(pmcid)
                           / f'{pmcid}_{figure_name}')
            records = split_subfigure(figure_file)
            insert_subfigure_helper.extend(
                [(pmcid, figure_name, r.xtl, r.ytl, r.xbr, r.ybr,
                  'PMCFigureX', insert_time) for r in records])
        insert_subfigure_helper.finish()
        insert_subfigure_helper.summarize()

        # get all subfigures
        subfigures = select_helper(
            conn, sql_select_subfigure,
            ['pmcid', 'figure_name', 'xtl', 'ytl', 'xbr', 'ybr'])
    finally:
        conn.close()

    data = []
    # Iterating the columns in lockstep avoids building a Series per row,
    # which is what ``iterrows`` does.
    for pmcid, figure_name, xtl, ytl, xbr, ybr in tqdm.tqdm(
            zip(subfigures['pmcid'], subfigures['figure_name'],
                subfigures['xtl'], subfigures['ytl'],
                subfigures['xbr'], subfigures['ybr']),
            total=len(subfigures),
            desc='Get subfigures'):
        # The exported path does not include image_dir; the box coordinates
        # are embedded in the file name.
        figure_file = generate_path(pmcid) / f'{pmcid}_{figure_name}'
        dst = figure_file.parent / (
            f'{figure_file.stem}_{xtl}x{ytl}_{xbr}x{ybr}{figure_file.suffix}')
        data.append({
            'pmcid': pmcid,
            'figure path': dst.as_posix(),
            'xtl': xtl,
            'ytl': ytl,
            'xbr': xbr,
            'ybr': ybr,
            'type': 'subfigure',
        })

    # Explicit columns keep the CSV header present even when there are no
    # rows, so the output can always be read back.
    columns = ['pmcid', 'figure path', 'xtl', 'ytl', 'xbr', 'ybr', 'type']
    pd.DataFrame(data, columns=columns).to_csv(subfigures_dst, index=False)
コード例 #2
0
ファイル: get_bioc.py プロジェクト: yfpeng/PMCFigureX
def get_bioc_f(db_file, bioc_dir):
    """Fetch missing BioC XML files and record the outcome per article.

    For every (pmcid, pmid) pair returned by ``sql_select_empty_bioc`` the
    file ``<bioc_dir>/<generate_path(pmcid)>/<pmcid>.xml`` is checked. If it
    is absent, ``get_bioc(pmid, dst)`` is called. ``sql_update_articles`` is
    queued with ``(1, pmcid)`` when the file exists or was fetched, and with
    ``(0, pmcid)`` when the fetch raised ``urllib.error.HTTPError``.
    Counters are printed at the end.

    Args:
        db_file: Database path handed to ``sqlite3.connect``.
        bioc_dir: Root directory for BioC files. It is combined with ``/``,
            so presumably a ``pathlib.Path`` -- confirm with callers.
    """
    cnt = collections.Counter()
    conn = sqlite3.connect(db_file)
    # try/finally releases the connection even if a download fails with
    # something other than the HTTPError handled below.
    try:
        df = select_helper(conn, sql_select_empty_bioc, ['pmcid', 'pmid'])
        update_article_helper = DBHelper(conn, sql_update_articles)
        update_article_helper.start()
        for pmcid, pmid in tqdm.tqdm(zip(df['pmcid'], df['pmid']),
                                     total=len(df)):
            cnt['total pmc'] += 1
            dst = bioc_dir / generate_path(pmcid) / f'{pmcid}.xml'
            if dst.exists():
                # Already on disk: only the database flag needs updating.
                update_article_helper.append((1, pmcid))
                continue
            try:
                get_bioc(pmid, dst)
            except urllib.error.HTTPError:
                update_article_helper.append((0, pmcid))
            else:
                cnt['new bioc'] += 1
                update_article_helper.append((1, pmcid))

        update_article_helper.finish()
    finally:
        conn.close()

    for k, v in cnt.most_common():
        print(k, ':', v)
コード例 #3
0
ファイル: get_figures.py プロジェクト: yfpeng/PMCFigureX
def get_figures(db_file, image_dir):
    """Download missing figure images and store their pixel sizes.

    For every (pmcid, figure_name) pair returned by ``sql_get_empty_figures``
    the local file ``<image_dir>/<generate_path(pmcid)>/<pmcid>_<figure_name>``
    is downloaded from PMC if it does not exist yet. The image is then
    opened and ``(width, height, pmcid, figure_name)`` is queued for
    ``sql_update_figure_size``. Counters are printed at the end.

    Args:
        db_file: Database path handed to ``sqlite3.connect``.
        image_dir: Root directory for figure files. It is combined with
            ``/``, so presumably a ``pathlib.Path`` -- confirm with callers.
    """
    cnt = collections.Counter()
    conn = sqlite3.connect(db_file)
    # try/finally releases the connection even when a download or a database
    # operation raises unexpectedly.
    try:
        df = select_helper(conn, sql_get_empty_figures,
                           ['pmcid', 'figure_name'])

        update_figure_helper = DBHelper(conn, sql_update_figure_size)
        update_figure_helper.start()
        for pmcid, figure_name in tqdm.tqdm(zip(df['pmcid'],
                                                df['figure_name']),
                                            total=len(df)):
            local_file = (image_dir / generate_path(pmcid)
                          / f'{pmcid}_{figure_name}')
            if not local_file.exists():
                url = f'https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/bin/{figure_name}'
                try:
                    urllib.request.urlretrieve(url, local_file)
                    cnt['new figure'] += 1
                except urllib.error.HTTPError:
                    cnt['Http error'] += 1
                    # Create an empty placeholder: an existing file makes the
                    # ``exists()`` check above skip the download next time.
                    with open(local_file, 'w'):
                        pass

            try:
                # The context manager closes the underlying file handle,
                # which a bare ``Image.open`` would keep open.
                with Image.open(local_file) as im:
                    size = (im.width, im.height)
            except Exception:
                # Best effort: an unreadable image (e.g. the empty
                # placeholder) is counted and skipped. ``Exception`` rather
                # than a bare ``except`` lets Ctrl-C and SystemExit through.
                cnt['Image error'] += 1
            else:
                # Kept outside the ``try`` so a database failure is not
                # silently miscounted as an image error.
                update_figure_helper.append(size + (pmcid, figure_name))
            cnt['total figure'] += 1

        update_figure_helper.finish()
    finally:
        conn.close()

    ppprint.pprint_counter(cnt, percentage=False)
コード例 #4
0
def get_figure_url(db_file, bioc_dir):
    """Extract figure names from BioC files and insert them into the database.

    For every pmcid returned by ``sql_select_new_bioc`` the file
    ``<bioc_dir>/<generate_path(pmcid)>/<pmcid>.xml`` is passed to
    ``get_figure_link``. Each distinct figure name is queued for
    ``sql_insert_figure`` as ``(pmcid, figure_name, insert_time)``.

    Args:
        db_file: Database path handed to ``sqlite3.connect``.
        bioc_dir: Root directory of the BioC files. It is combined with
            ``/``, so presumably a ``pathlib.Path`` -- confirm with callers.
    """
    conn = sqlite3.connect(db_file)
    # try/finally releases the connection even if parsing a file raises.
    try:
        df = select_helper(conn, sql_select_new_bioc, ['pmcid'])

        # One timestamp shared by every record inserted during this run.
        insert_time = f'{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}'
        insert_helper = DBHelper(conn, sql_insert_figure)
        insert_helper.start()
        for pmcid in tqdm.tqdm(df['pmcid'], total=len(df)):
            biocfile = bioc_dir / generate_path(pmcid) / f'{pmcid}.xml'
            figure_names = get_figure_link(biocfile)
            # A set drops figure names that occur more than once in a file.
            insert_helper.extend({(pmcid, figure_name, insert_time)
                                  for figure_name in figure_names})
        insert_helper.finish()
    finally:
        conn.close()
コード例 #5
0
ファイル: split_figures.py プロジェクト: yfpeng/PMCFigureX
def split_figure_f(db_file, image_dir, model_pathname, batch_size=16):
    """Run the figure separator on figures that have no JSON result yet.

    The (pmcid, figure_name) pairs come from
    ``sql_select_empty_subfigurejsonfiles``. For each figure whose ``.json``
    sibling does not exist, the image is queued. Queued images are sent to
    ``FigureSeparator.extract_batch`` in groups of ``batch_size``, and each
    result's ``'sub_figures'`` entry is dumped to the ``.json`` file.

    Args:
        db_file: Database path handed to ``sqlite3.connect``.
        image_dir: Root directory of the figure files. It is combined with
            ``/``, so presumably a ``pathlib.Path`` -- confirm with callers.
        model_pathname: Model location; converted with ``str`` and passed to
            ``FigureSeparator``.
        batch_size: Number of images sent to the separator per call.
    """
    conn = sqlite3.connect(db_file)
    try:
        df = select_helper(conn, sql_select_empty_subfigurejsonfiles,
                           ['pmcid', 'figure_name'])
    finally:
        # The connection is only needed for the select above.
        conn.close()

    cnt = collections.Counter()
    tf.compat.v1.disable_eager_execution()
    separator = FigureSeparator(str(model_pathname))
    with tf.compat.v1.Session(graph=separator.graph) as sess:

        def split_and_save(batch):
            """Separate every (src, dst) pair in *batch* and write the JSON."""
            srcs = [src for src, _ in batch]
            results = separator.extract_batch(sess, srcs)
            assert len(results) == len(srcs)
            for (_, dst), result in zip(batch, results):
                # ``with`` closes the output file deterministically instead
                # of leaving the handle to the garbage collector.
                with open(dst, 'w') as fp:
                    json.dump(result['sub_figures'], fp)

        needs_to_split = []
        for pmcid, figure_name in tqdm.tqdm(zip(df['pmcid'],
                                                df['figure_name']),
                                            total=len(df)):
            src = (image_dir / generate_path(pmcid)
                   / f'{pmcid}_{figure_name}')
            dst = src.with_suffix('.json')
            if dst.exists():
                # NOTE(review): is_file_empty(src) is only consulted when the
                # JSON already exists; sources without a JSON are queued
                # without this check -- confirm that is intended.
                if is_file_empty(src):
                    cnt['empty figure'] += 1
                continue
            needs_to_split.append((src, dst))
            if len(needs_to_split) >= batch_size:
                split_and_save(needs_to_split)
                needs_to_split = []
        # Flush the final, possibly smaller, batch.
        if needs_to_split:
            split_and_save(needs_to_split)

    for k, v in cnt.most_common():
        print(k, ':', v)
コード例 #6
0
def get_pmc_from_pmid_f(src, db_file):
    """Look up PMC ids for new PubMed ids and insert them as articles.

    The ``pmid`` column of the tab-separated file ``src`` is read (lines
    starting with ``#`` are treated as comments). Ids already returned by
    ``sql_select_articles`` are skipped. The rest are resolved in batches
    through ``get_pmc_from_pmid``, and each result is queued for
    ``sql_insert_articles`` as ``(pmcid, pmid, doi, insert_time)``.

    Args:
        src: Path or buffer accepted by ``pandas.read_csv``.
        db_file: Database path; ``~`` is expanded before connecting.
    """
    # Number of pmids sent to get_pmc_from_pmid per call.
    batch_size = 200

    pubmed_export_df = pd.read_csv(src, sep='\t', dtype=str, comment='#')
    # With dtype=str an empty cell is read as NaN; drop those so that only
    # real id strings are forwarded to the lookup.
    new_pmids = set(pubmed_export_df['pmid'].dropna())

    conn = sqlite3.connect(os.path.expanduser(db_file))
    # try/finally releases the connection even if a lookup request raises.
    try:
        history_df = select_helper(conn, sql_select_articles,
                                   columns=['pmid'])
        pmids = list(new_pmids - set(history_df['pmid']))

        # One timestamp shared by every record inserted during this run.
        insert_time = f'{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}'

        insert_articles_helper = DBHelper(conn, sql_insert_articles)
        insert_articles_helper.start()
        for start in tqdm.tqdm(range(0, len(pmids), batch_size)):
            results = get_pmc_from_pmid(pmids[start:start + batch_size])
            for pmid, v in results.items():
                insert_articles_helper.append(
                    (v['pmcid'], pmid, v['doi'], insert_time))

        insert_articles_helper.finish()
    finally:
        conn.close()
コード例 #7
0
ファイル: get_subfigures.py プロジェクト: yfpeng/PMCFigureX
def get_figure_f(db_file, image_dir, figures_dst):
    """Export every whole figure in the database to a CSV file.

    Each row returned by ``sql_select_figure`` becomes one CSV row. Its
    bounding box spans the full image, ``(0, 0)`` to ``(width, height)``,
    and its ``type`` is ``'figure'``.

    Args:
        db_file: Database path handed to ``sqlite3.connect``.
        image_dir: Not referenced by this function. The exported paths are
            built from ``generate_path(pmcid)`` only.
        figures_dst: Destination passed to ``DataFrame.to_csv``.
    """
    conn = sqlite3.connect(db_file)
    try:
        figures = select_helper(conn, sql_select_figure,
                                ['pmcid', 'figure_name', 'width', 'height'])
    finally:
        # The connection is only needed for the select above.
        conn.close()

    data = []
    # Iterating the columns in lockstep avoids building a Series per row,
    # which is what ``iterrows`` does.
    for pmcid, figure_name, width, height in tqdm.tqdm(
            zip(figures['pmcid'], figures['figure_name'],
                figures['width'], figures['height']),
            total=len(figures),
            desc='Get whole figures'):
        figure_file = generate_path(pmcid) / f'{pmcid}_{figure_name}'
        data.append({
            'pmcid': pmcid,
            'figure path': figure_file.as_posix(),
            'xtl': 0,
            'ytl': 0,
            'xbr': width,
            'ybr': height,
            'type': 'figure',
        })

    # Explicit columns keep the CSV header present even when there are no
    # rows, so the output can always be read back.
    columns = ['pmcid', 'figure path', 'xtl', 'ytl', 'xbr', 'ybr', 'type']
    pd.DataFrame(data, columns=columns).to_csv(figures_dst, index=False)