Exemple #1
0
def get_subfigure_f(db_file, image_dir, subfigures_dst):
    """Store newly split subfigures in the database and export all of them.

    Step 1: every ``(pmcid, figure_name)`` returned by
    ``sql_select_empty_subfigurejsonfiles`` is passed to ``split_subfigure``
    and the resulting boxes are inserted through ``sql_insert_subfigure``.
    Step 2: every row returned by ``sql_select_subfigure`` is written to
    ``subfigures_dst`` as CSV (one row per subfigure, ``type='subfigure'``).

    Args:
        db_file: SQLite database file.
        image_dir: Root directory of the figure files; must support the
            ``/`` operator (e.g. ``pathlib.Path``).
        subfigures_dst: Destination of the CSV file.
    """
    csv_columns = ['pmcid', 'figure path', 'xtl', 'ytl', 'xbr', 'ybr', 'type']

    conn = sqlite3.connect(db_file)
    try:
        df = select_helper(conn, sql_select_empty_subfigurejsonfiles,
                           ['pmcid', 'figure_name'])

        insert_time = f'{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}'
        insert_subfigure_helper = DBHelper(conn, sql_insert_subfigure)
        insert_subfigure_helper.start()
        for pmcid, figure_name in tqdm.tqdm(zip(df['pmcid'],
                                                df['figure_name']),
                                            total=len(df),
                                            desc='Query new subfigures'):
            figure_file = (image_dir / generate_path(pmcid)
                           / f'{pmcid}_{figure_name}')
            insert_subfigure_helper.extend([
                (pmcid, figure_name, r.xtl, r.ytl, r.xbr, r.ybr,
                 'PMCFigureX', insert_time)
                for r in split_subfigure(figure_file)
            ])
        insert_subfigure_helper.finish()
        insert_subfigure_helper.summarize()

        # get all subfigures
        df = select_helper(
            conn, sql_select_subfigure,
            ['pmcid', 'figure_name', 'xtl', 'ytl', 'xbr', 'ybr'])
        data = []
        # itertuples keeps each column's dtype; iterrows builds one Series
        # per row and may upcast the coordinates used in the file name.
        for row in tqdm.tqdm(df.itertuples(index=False),
                             total=len(df),
                             desc='Get subfigures'):
            # The exported path is relative (image_dir is not prepended).
            figure_file = (generate_path(row.pmcid)
                           / f'{row.pmcid}_{row.figure_name}')
            dst = figure_file.with_name(
                f'{figure_file.stem}_{row.xtl}x{row.ytl}'
                f'_{row.xbr}x{row.ybr}{figure_file.suffix}')
            data.append({
                'pmcid': row.pmcid,
                'figure path': dst.as_posix(),
                'xtl': row.xtl,
                'ytl': row.ytl,
                'xbr': row.xbr,
                'ybr': row.ybr,
                'type': 'subfigure',
            })
        # Explicit columns keep the header row even when there is no data,
        # so the file stays loadable with pd.read_csv.
        pd.DataFrame(data, columns=csv_columns).to_csv(subfigures_dst,
                                                       index=False)
    finally:
        # Release the connection even if a query or the splitter fails.
        conn.close()
Exemple #2
0
def get_bioc_f(db_file, bioc_dir):
    """Download missing BioC files and record the outcome per article.

    For every ``(pmcid, pmid)`` returned by ``sql_select_empty_bioc`` the
    expected file ``bioc_dir/<generate_path(pmcid)>/<pmcid>.xml`` is checked.
    If it is missing, ``get_bioc(pmid, dst)`` is called. Each article is then
    queued for ``sql_update_articles`` with ``1`` when the file exists or was
    fetched and ``0`` when the download raised ``urllib.error.HTTPError``.
    A summary of the counters is printed at the end.

    Args:
        db_file: SQLite database file.
        bioc_dir: Root directory of the BioC files; must support the ``/``
            operator (e.g. ``pathlib.Path``).
    """
    cnt = collections.Counter()
    conn = sqlite3.connect(db_file)
    try:
        df = select_helper(conn, sql_select_empty_bioc, ['pmcid', 'pmid'])

        update_article_helper = DBHelper(conn, sql_update_articles)
        update_article_helper.start()
        for pmcid, pmid in tqdm.tqdm(zip(df['pmcid'], df['pmid']),
                                     total=len(df)):
            cnt['total pmc'] += 1
            dst = bioc_dir / generate_path(pmcid) / f'{pmcid}.xml'
            if dst.exists():
                update_article_helper.append((1, pmcid))
                continue

            # Only the download itself is guarded, so unrelated errors are
            # not mistaken for an HTTP failure.
            try:
                get_bioc(pmid, dst)
            except urllib.error.HTTPError:
                # Count the failure instead of dropping it silently.
                cnt['http error'] += 1
                update_article_helper.append((0, pmcid))
            else:
                cnt['new bioc'] += 1
                update_article_helper.append((1, pmcid))

        update_article_helper.finish()
    finally:
        # Release the connection even if a query or download fails.
        conn.close()

    for k, v in cnt.most_common():
        print(k, ':', v)
Exemple #3
0
def get_figures(db_file, image_dir):
    """Download missing figure images and store their pixel size.

    For every ``(pmcid, figure_name)`` returned by ``sql_get_empty_figures``
    the local file ``image_dir/<generate_path(pmcid)>/<pmcid>_<figure_name>``
    is fetched from PMC when it does not exist yet. If the download fails
    with ``urllib.error.HTTPError`` an empty placeholder file is created.
    The image is then opened and ``(width, height, pmcid, figure_name)`` is
    queued for ``sql_update_figure_size``; images that cannot be read are
    only counted. Counters are printed at the end.

    Args:
        db_file: SQLite database file.
        image_dir: Root directory of the figure files; must support the
            ``/`` operator (e.g. ``pathlib.Path``).
    """
    cnt = collections.Counter()
    conn = sqlite3.connect(db_file)
    try:
        df = select_helper(conn, sql_get_empty_figures,
                           ['pmcid', 'figure_name'])

        update_figure_helper = DBHelper(conn, sql_update_figure_size)
        update_figure_helper.start()
        for pmcid, figure_name in tqdm.tqdm(zip(df['pmcid'],
                                                df['figure_name']),
                                            total=len(df)):
            local_file = (image_dir / generate_path(pmcid)
                          / f'{pmcid}_{figure_name}')
            if not local_file.exists():
                # urlretrieve and open() both need the directory to exist.
                local_file.parent.mkdir(parents=True, exist_ok=True)
                url = f'https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/bin/{figure_name}'
                try:
                    urllib.request.urlretrieve(url, local_file)
                    cnt['new figure'] += 1
                except urllib.error.HTTPError:
                    cnt['Http error'] += 1
                    # Leave an empty placeholder so the file exists on the
                    # next run and the download is not attempted again.
                    with open(local_file, 'w'):
                        pass

            try:
                # The context manager closes the underlying file handle.
                with Image.open(local_file) as im:
                    size = (im.width, im.height)
            except Exception:
                # Best effort: unreadable or placeholder files are counted,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                cnt['Image error'] += 1
            else:
                # Outside the try so database errors are not hidden as
                # image errors.
                update_figure_helper.append((*size, pmcid, figure_name))
            cnt['total figure'] += 1

        update_figure_helper.finish()
    finally:
        # Release the connection even if a query or download fails.
        conn.close()

    ppprint.pprint_counter(cnt, percentage=False)
Exemple #4
0
def move1(src_dir, dst_dir):
    """Move every entry of ``src_dir`` into the sharded ``dst_dir`` layout.

    The PMC id is the entry name without its last suffix. The entry is moved
    to ``dst_dir/<generate_path(pmcid)>/<pmcid>.xml``; missing target
    directories are created.

    Args:
        src_dir: Directory whose entries are moved.
        dst_dir: Root of the destination tree; must support the ``/``
            operator (e.g. ``pathlib.Path``).
    """
    with os.scandir(src_dir) as entries:
        for item in tqdm.tqdm(entries):
            pmcid = Path(item.name).stem
            target_dir = dst_dir / generate_path(pmcid)
            target_dir.mkdir(parents=True, exist_ok=True)
            shutil.move(item.path, target_dir / f'{pmcid}.xml')
Exemple #5
0
def get_figure_url(db_file, bioc_dir):
    """Insert the figure names found in newly available BioC files.

    For every ``pmcid`` returned by ``sql_select_new_bioc`` the file
    ``bioc_dir/<generate_path(pmcid)>/<pmcid>.xml`` is passed to
    ``get_figure_link``. The distinct ``(pmcid, figure_name, insert_time)``
    tuples are queued for ``sql_insert_figure``.

    Args:
        db_file: SQLite database file.
        bioc_dir: Root directory of the BioC files; must support the ``/``
            operator (e.g. ``pathlib.Path``).
    """
    connection = sqlite3.connect(db_file)
    articles = select_helper(connection, sql_select_new_bioc, ['pmcid'])

    # One timestamp shared by all rows inserted during this run.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    writer = DBHelper(connection, sql_insert_figure)
    writer.start()
    progress = tqdm.tqdm(articles['pmcid'], total=len(articles))
    for pmcid in progress:
        xml_path = bioc_dir / generate_path(pmcid) / f'{pmcid}.xml'
        # A set drops figure names that appear more than once in a file.
        unique_rows = {(pmcid, name, timestamp)
                       for name in get_figure_link(xml_path)}
        writer.extend(unique_rows)
    writer.finish()

    connection.close()
Exemple #6
0
def get_figure_text(src1, src2, dest, history_file, bioc_dir):
    """Attach BioC text to figures and dump them as JSON.

    The two CSV files are read as strings and concatenated, then turned into
    figure objects by ``create_figures``. For each figure the first document
    of ``bioc_dir/<generate_path(pmcid)>/<pmcid>.xml`` is loaded (once per
    ``pmcid``) and passed to ``add_text``. Finally ``to_dict()`` of every
    figure is written to ``dest`` as indented JSON.

    Args:
        src1: First CSV file.
        src2: Second CSV file.
        dest: Destination JSON file (written as UTF-8).
        history_file: Forwarded to ``create_figures``.
        bioc_dir: Root directory of the BioC files; must support the ``/``
            operator (e.g. ``pathlib.Path``).
    """
    df1 = pd.read_csv(src1, dtype=str)
    df2 = pd.read_csv(src2, dtype=str)
    df = pd.concat([df1, df2], axis=0)
    figures = create_figures(df, history_file=history_file)

    docs = {}  # type: Dict[str, bioc.BioCDocument]
    for figure in figures:
        pmcid = figure.pmcid
        if pmcid not in docs:
            src = bioc_dir / generate_path(pmcid) / f'{pmcid}.xml'
            # Close each XML file right after parsing; previously one handle
            # per article stayed open until garbage collection.
            # NOTE(review): the file is read with the platform default
            # encoding, as before - confirm whether utf8 should be forced.
            with open(src) as fp:
                collection = bioc.load(fp)
            docs[pmcid] = collection.documents[0]
        add_text(figure, docs[pmcid])

    with open(dest, 'w', encoding='utf8') as fp:
        json.dump([f.to_dict() for f in figures], fp, indent=2)
Exemple #7
0
def split_figure_f(db_file, image_dir, model_pathname, batch_size=16):
    """Run the figure separator on figures that have no JSON result yet.

    Figures are taken from ``sql_select_empty_subfigurejsonfiles``. For each
    figure file the sibling ``.json`` path is checked. Figures without it are
    collected into batches of ``batch_size``, passed to
    ``FigureSeparator.extract_batch``, and the ``'sub_figures'`` entry of
    each result is written to that JSON path. Counters are printed at the
    end.

    Args:
        db_file: SQLite database file.
        image_dir: Root directory of the figure files; must support the
            ``/`` operator (e.g. ``pathlib.Path``).
        model_pathname: Model location handed to ``FigureSeparator``.
        batch_size: Number of figures processed per ``extract_batch`` call.
    """
    conn = sqlite3.connect(db_file)
    try:
        df = select_helper(conn, sql_select_empty_subfigurejsonfiles,
                           ['pmcid', 'figure_name'])
    finally:
        # The connection is only needed for the select.
        conn.close()

    cnt = collections.Counter()
    tf.compat.v1.disable_eager_execution()
    separator = FigureSeparator(str(model_pathname))
    with tf.compat.v1.Session(graph=separator.graph) as sess:

        def split_and_save(batch):
            """Separate one batch of (src, dst) pairs and write the JSON."""
            srcs = [src for src, _ in batch]
            results = separator.extract_batch(sess, srcs)
            # zip() below would silently drop items on a length mismatch.
            assert len(results) == len(srcs)
            for (_, dst), result in zip(batch, results):
                # Close the file explicitly so the JSON is flushed to disk
                # instead of waiting for garbage collection.
                with open(dst, 'w') as fp:
                    json.dump(result['sub_figures'], fp)

        needs_to_split = []
        for pmcid, figure_name in tqdm.tqdm(zip(df['pmcid'],
                                                df['figure_name']),
                                            total=len(df)):
            src = image_dir / generate_path(pmcid) / f'{pmcid}_{figure_name}'
            dst = src.with_suffix('.json')
            if not dst.exists():
                needs_to_split.append((src, dst))
                if len(needs_to_split) >= batch_size:
                    split_and_save(needs_to_split)
                    needs_to_split = []
            elif is_file_empty(src):
                # Emptiness is only checked for figures that already have a
                # JSON file, matching the previous behaviour.
                cnt['empty figure'] += 1

        # Flush the last, possibly partial, batch.
        if needs_to_split:
            split_and_save(needs_to_split)

    for k, v in cnt.most_common():
        print(k, ':', v)
Exemple #8
0
def get_figure_f(db_file, image_dir, figures_dst):
    """Export every whole figure with its full-size bounding box as CSV.

    Each row returned by ``sql_select_figure`` becomes one CSV row. Its
    relative path is ``<generate_path(pmcid)>/<pmcid>_<figure_name>``, its
    box spans ``(0, 0)`` to ``(width, height)``, and its ``type`` is
    ``'figure'``.

    Args:
        db_file: SQLite database file.
        image_dir: Not used by this function; kept for interface
            compatibility.
        figures_dst: Destination of the CSV file.
    """
    csv_columns = ['pmcid', 'figure path', 'xtl', 'ytl', 'xbr', 'ybr', 'type']

    conn = sqlite3.connect(db_file)
    try:
        df = select_helper(conn, sql_select_figure,
                           ['pmcid', 'figure_name', 'width', 'height'])
    finally:
        # Release the connection even if the query fails.
        conn.close()

    data = []
    # itertuples keeps each column's dtype; iterrows builds one Series per
    # row and may upcast width/height.
    for row in tqdm.tqdm(df.itertuples(index=False),
                         total=len(df),
                         desc='Get whole figures'):
        figure_file = (generate_path(row.pmcid)
                       / f'{row.pmcid}_{row.figure_name}')
        data.append({
            'pmcid': row.pmcid,
            'figure path': figure_file.as_posix(),
            'xtl': 0,
            'ytl': 0,
            'xbr': row.width,
            'ybr': row.height,
            'type': 'figure',
        })

    # Explicit columns keep the header row even when there is no data, so
    # the file stays loadable with pd.read_csv.
    pd.DataFrame(data, columns=csv_columns).to_csv(figures_dst, index=False)