Exemple #1
0
def main(_):
    """Run BioBERT NER over a fixed list of PubMed ids as a manual check.

    For every document returned by ``pubtator_biocxml2dict_list`` this prints
    the PMID, dumps the document to ``<pmid>.json`` under a hard-coded data
    directory, and passes the document to ``biobert.recognize``. The model is
    always closed, even when one of those steps raises.

    Args:
        _: Unused positional argument.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"

    biobert = BioBERT(FLAGS)

    # try/finally so that close() runs even if fetching, dumping or tagging
    # fails part-way through.
    try:
        from convert import pubtator_biocxml2dict_list
        import json

        dl = pubtator_biocxml2dict_list(
            [26658955, 24189420, 22579007, 29185436])

        # NOTE(review): the fetcher appears to return None or an error string
        # instead of a list on failure -- confirm against
        # convert.pubtator_biocxml2dict_list. Iterating either value would
        # fail with an unhelpful TypeError, so report it explicitly.
        if dl is None or isinstance(dl, str):
            reason = 'no response' if dl is None else dl.replace('\n', '')
            print('error: tmtool: {}'.format(reason))
            return

        for d in dl:
            print(d['pmid'])
            with open('/media/donghyeon/f7c53837-2156-4793-b2b1-4b0578dffef1'
                      '/biobert/BioBert_NER/BioBERTNER/data/' + d['pmid']
                      + '.json',
                      'w', encoding='utf-8') as f_out:
                json.dump(d, f_out)
            biobert.recognize([d])
    finally:
        biobert.close()

    show_prof_data()
Exemple #2
0
    def do_GET(self):
        """Handle a GET request: fetch, tag and normalize the requested PMIDs.

        Query parameters read from ``self.path``:

        * ``pmid``   -- required; comma-separated ids, at most
          ``self.stm_dict['n_pmid_limit']`` of them.
        * ``format`` -- optional; must be one of
          ``self.stm_dict['available_formats']``. Missing or unknown values
          fall back to the first available format.
        * ``indent`` -- optional; ``true`` (case-insensitive) makes the JSON
          output use a 4-space indent and sorted keys.

        A 200 status line is always sent before any validation. Request
        validation failures are answered with a plain-text summary of the
        request followed by the reason. Failures of the fetch or NER step are
        answered with a single error document in the selected output format.
        On success the tagged documents are written in the selected format.
        """
        get_start_t = time.time()
        parsed_path = urlparse.urlparse(self.path)
        # Thread.getName() is deprecated; the ``name`` attribute is equivalent.
        cur_thread_name = threading.current_thread().name

        message = '\n'.join([
            'CLIENT VALUES:',
            'client_address=%s (%s)' %
            (self.client_address, self.address_string()),
            'command=%s' % self.command,
            'path=%s' % self.path,
            'real path=%s' % parsed_path.path,
            'query=%s' % parsed_path.query,
            'request_version=%s' % self.request_version,
            '',
            'SERVER VALUES:',
            'server_version=%s' % self.server_version,
            'sys_version=%s' % self.sys_version,
            'protocol_version=%s' % self.protocol_version,
            'thread_name=%s' % cur_thread_name,
        ])
        self.send_response(200)
        self.end_headers()

        elapsed_time_dict = dict()

        time_format = self.stm_dict['time_format']
        available_formats = self.stm_dict['available_formats']

        def log(*parts):
            # Every log line is prefixed with the current timestamp.
            print(datetime.now().strftime(time_format), *parts)

        def reject(err_msg):
            # Validation failure: log it, then reply with the request summary
            # followed by the reason.
            log('[' + cur_thread_name + ']', err_msg)
            self.wfile.write((message + '\n' + err_msg).encode('utf-8'))

        # urlparse yields '' (never None) when the URL has no query part, so
        # test for emptiness rather than identity with None.
        if not parsed_path.query:
            reject('No url query')
            return

        qs_dict = urlparse.parse_qs(parsed_path.query)

        if not qs_dict.get('pmid'):
            reject('No pmid param')
            return

        pmid_list = qs_dict['pmid'][0].split(',')

        if len(pmid_list) > self.stm_dict['n_pmid_limit']:
            reject('Too many (> {}) pmids: {}'.format(
                self.stm_dict['n_pmid_limit'], len(pmid_list)))
            return

        out_format = available_formats[0]
        requested_format = qs_dict.get('format')
        if requested_format:
            if requested_format[0] in available_formats:
                out_format = requested_format[0]
            else:
                print('Unavailable format', requested_format[0])

        # Only the literal value "true" (any case) enables pretty-printing.
        indent = None
        if qs_dict.get('indent') and qs_dict['indent'][0].lower() == 'true':
            indent = 4

        text_hash = \
            hashlib.sha224(qs_dict['pmid'][0].encode('utf-8')).hexdigest()
        log('[{}] text_hash: {}'.format(cur_thread_name, text_hash))

        # NOTE: re-using / saving previous outputs keyed by text_hash is
        # currently disabled.

        is_raw_text = False

        def reply_error(abstract, announce=True):
            # Reply with a single error document in the selected format.
            error_dict = self.get_err_dict()
            error_dict['pmid'] = pmid_list[0] if len(pmid_list) == 1 else ''
            error_dict['abstract'] = abstract
            if announce:
                log('[' + cur_thread_name + ']', error_dict['abstract'])

            if out_format == available_formats[0]:
                self.wfile.write(
                    json.dumps(
                        [get_pub_annotation(error_dict,
                                            is_raw_text=is_raw_text)],
                        indent=indent,
                        sort_keys=indent is not None).encode('utf-8'))
            elif out_format == available_formats[1]:
                self.wfile.write(get_pubtator([error_dict]).encode('utf-8'))

        tmtool_start_t = time.time()
        dict_list = pubtator_biocxml2dict_list(pmid_list)
        tmtool_time = time.time() - tmtool_start_t
        elapsed_time_dict['tmtool'] = round(tmtool_time, 3)

        if dict_list is None:
            reply_error('error: tmtool: no response')
            return

        if isinstance(dict_list, str):
            # A string result carries an error description from the fetcher.
            if 'currently unavailable' in dict_list:
                abstract = 'error: tmtool: currently unavailable'
            elif 'invalid version format' in dict_list:
                abstract = 'error: tmtool: invalid version format'
            else:
                abstract = 'error: tmtool: {}'.format(
                    dict_list.replace('\n', ''))
            reply_error(abstract)
            return

        log('[{}] tmTool: PubMed & GNormPlus & tmVar {:.3f} sec'.format(
            cur_thread_name, tmtool_time))

        # Run BioBERT NER models of Lee et al., 2019
        ner_start_time = time.time()
        tagged_docs, num_entities = \
            self.biobert_recognize(dict_list, is_raw_text, cur_thread_name)
        ner_time = time.time() - ner_start_time
        elapsed_time_dict['ner'] = round(ner_time, 3)
        if tagged_docs is None:
            # This failure is reported to the client only, not logged.
            reply_error('error: BioBERT NER, out of index range',
                        announce=False)
            return
        log('[%s] NER %.3f sec, #entities: %d, #articles: %d' %
            (cur_thread_name, ner_time, num_entities, len(tagged_docs)))

        # Normalization models (skipped when NER found nothing)
        normalization_time = 0.
        if num_entities > 0:
            normalization_start_time = time.time()
            tagged_docs = self.normalizer.normalize(text_hash,
                                                    tagged_docs,
                                                    cur_thread_name,
                                                    is_raw_text=is_raw_text)
            normalization_time = time.time() - normalization_start_time
        elapsed_time_dict['normalization'] = round(normalization_time, 3)

        # apply output format
        if out_format == available_formats[0]:
            elapsed_time_dict['total'] = round(time.time() - get_start_t, 3)

            # PubAnnotation JSON
            pubannotation_res = [
                get_pub_annotation(d,
                                   is_raw_text=is_raw_text,
                                   elapsed_time_dict=elapsed_time_dict)
                for d in tagged_docs
            ]
            self.wfile.write(
                json.dumps(pubannotation_res,
                           indent=indent,
                           sort_keys=indent is not None).encode('utf-8'))

        elif out_format == available_formats[1]:
            # PubTator
            self.wfile.write(get_pubtator(tagged_docs).encode('utf-8'))

        log('[{}] Done. Total {:.3f} sec\n'.format(cur_thread_name,
                                                   time.time() - get_start_t))
Exemple #3
0
    t = t.replace('\r', '')
    t = t.replace('\n', ' ')
    t = t.replace('\t', ' ')
    t = t.replace('
\u2028
', ' ')
    t = t.replace('\u2029', ' ')
    return t


if __name__ == '__main__':
    # Ad-hoc manual check: convert two PMIDs and print the result.
    from convert import pubtator_biocxml2dict_list

    # testpmids = [25681199, 29446767]
    testpmids = [29446767, 25681199]
    # print(query_pubtator2([29446767,25681199]))
    # print(query_pubtator2([25681199]))
    print(pubtator_biocxml2dict_list(testpmids))

    import sys
    # NOTE(review): this unconditional exit makes every statement below
    # unreachable; the remainder looks like leftover experiments.
    sys.exit(0)

    # Unreachable: calls get_pmc_archive with four paths built under
    # ~/bestplus/pmc (two list files, two directories).
    get_pmc_archive(
        os.path.expanduser('~') + '/bestplus/pmc/oa_non_comm_use_pdf.txt',
        os.path.expanduser('~') + '/bestplus/pmc/non_comm',
        os.path.expanduser('~') + '/bestplus/pmc/oa_comm_use_file_list.txt',
        os.path.expanduser('~') + '/bestplus/pmc/comm'
    )

    from convert import pubtatorstr2dict_list

    # raw texts & demo
    # Unreachable: passes one raw sentence to query_raw_bern.
    bern_raw_res = query_raw_bern('Results We identified five activating mutations in the PIK3CA gene in affected tissues from 6 of the 9 patients studied; one of the variants (NM_006218.2:c.248T>C; p.Phe83Ser) has not been previously described in developmental disorders.')