def prepareGloveModel():
    """Initialise ConfigReader and construct a GloveModel from its settings.

    ConfigReader is instantiated with '../conf/smartanno_conf.json'; the
    ``glove/model_path``, ``glove/vocab`` and ``glove/vector`` values are then
    read and passed to the ``GloveModel`` constructor.  Nothing is returned.
    """
    ConfigReader('../conf/smartanno_conf.json')
    glove_settings = dict(
        word2vec_file=ConfigReader.getValue('glove/model_path'),
        vocab=ConfigReader.getValue('glove/vocab'),
        vect=ConfigReader.getValue('glove/vector'),
    )
    GloveModel(**glove_settings)
    # Read the class-level model attribute as before; the value is not used
    # any further inside this function.
    loaded_model = GloveModel.glove_model
Beispiel #2
0
    def start(self):
        """Display the word rows for this step, or skip the step entirely.

        When the ``glove/model_path`` setting is ``None`` or empty, the step
        located two positions after this one (``self.pos_id + 2``) is started
        instead and nothing is displayed.  Otherwise the rows returned by
        ``showWords`` for the workflow filters are wrapped in a ``VBox`` and
        displayed.
        """
        # Query the configuration once rather than twice.
        glove_path = ConfigReader.getValue("glove/model_path")
        if glove_path is None or len(glove_path) == 0:
            self.workflow.steps[self.pos_id + 2].start()
            return

        # Make sure the workflow carries the dict this step family relies on.
        if not hasattr(self.workflow, 'we_extended'):
            self.workflow.we_extended = dict()
        rows = self.showWords(self.workflow.filters)
        # `flex_flow` is the Layout trait controlling direction; the previous
        # `flex_grown` keyword is not a Layout trait.
        self.box = widgets.VBox(rows, layout=widgets.Layout(display='flex', flex_flow='column'))
        display(self.box)
Beispiel #3
0
 def start(self):
     """Assemble the ordered list of workflow steps and start the workflow.

     The UMLS extender options are read from a ``ConfigReader`` instance
     while the step list is being built.  The resulting ``Workflow`` is
     stored on ``self.workflow`` and started with ``False`` as argument.
     """
     cr = ConfigReader()

     # Static HTML fragments, pulled out so the step list below stays compact.
     intro_html = '<h2>Welcome to SmartAnno!</h2><h4>Do you want to start from beginning or continue previous reviewing? </h4>'
     dataset_html = '<h4>Choose which dateaset you want to use: </h4>'
     types_html = (
         '<h3>Annotation types:</h3><p>List all the types you want to identify below. Each type per line.<br/>If you'
         'have too many types, try set up them separately, so that you won&apos;t need to choose from a long list '
         'for each sample. </p>')
     rb_done_html = (
         '<h2>Congratuations!</h2><h4>You have finished the initial review '
         'on the rule-base preannotations. </h4>')
     finish_html = '<h3>Well done!</h3><h4>Now you have finished reviewing all the samples. '

     # Steps are constructed in exactly this order.
     pipeline = [
         IntroStep(intro_html, name='intro'),
         DBInitiater(name='db_initiator'),
         DirChooser(name='choosedir'),
         ReadFiles(name='readfiles'),
         DocsToDB(name='save2db'),
         TaskChooser(name='tasknamer'),
         DataSetChooser(name='dataset_chooser', description=dataset_html),
         AnnotationTypeDef(types_html, name='types'),
         KeywordsFiltering(name='keywords'),
         # A PreviousNextIntSlider step named 'percent2filter' used to sit
         # here; it is currently disabled.
         KeywordsUMLSExtenderSetup(name='umls_extender_setup'),
         KeywordsUMLSExtender(
             name='umls_extender',
             sources=cr.getValue("umls/sources"),
             filter_by_length=cr.getValue("umls/filter_by_length"),
             filter_by_contains=cr.getValue("umls/filter_by_contains"),
             max_query=cr.getValue("umls/max_query")),
         KeywordsEmbeddingExtenderSetup(name='w_e_extender_setup'),
         KeywordsEmbeddingExtender(name='w_e_extender', max_query=40),
         ReviewRBInit(name="rb_review_init"),
         ReviewRBLoop(name='rb_review'),
         PreviousNextHTML(description=rb_done_html, name='rb_review_done'),
         ReviewMLInit(name='ml_review_init'),
         ReviewMLLoop(name='ml_review', ml_classifier_cls=self.ml_classifier_cls),
         PreviousNextHTML(name='finish', description=finish_html),
     ]
     self.workflow = Workflow(pipeline)
     self.workflow.start(False)
    def __init__(self,
                 targets=None,
                 modifiers=None,
                 feature_inference_rule=None,
                 document_inference_rule=None,
                 rush_rule=None,
                 expected_values=[],
                 save_markups=True):
        """Create the inferencers, the optional RuSH splitter and load rules.

        Args:
            targets: target rules. If both ``targets`` and ``modifiers`` are
                strings they are treated as rule files (see below); otherwise
                they are passed to ``setModifiersTargets``.
            modifiers: modifier rules, handled together with ``targets``.
            feature_inference_rule: passed to ``FeatureInferencer``.
            document_inference_rule: passed to ``DocumentInferencer``.
            rush_rule: path of the RuSH rule file. When it is ``None`` or not
                an existing file, the ``rush_rules_path`` config value is
                used instead.
            expected_values: values that are stored lower-cased; ``None`` is
                treated like an empty list.
            save_markups: stored on the instance unchanged.

        The newly created object is also recorded as
        ``RBDocumentClassifier.instance``.
        """
        self.document_inferencer = DocumentInferencer(document_inference_rule)
        self.feature_inferencer = FeatureInferencer(feature_inference_rule)
        self.conclusions = []
        self.modifiers = modifiers
        self.targets = targets
        self.save_markups = save_markups
        # A fresh list is always built, so the shared default is never mutated.
        self.expected_values = [value.lower() for value in (expected_values or [])]
        self.saved_markups_map = dict()
        self.pyrush = None

        # Fall back to the configured rule file when the argument is unusable.
        if rush_rule is None or not os.path.isfile(rush_rule):
            rush_rule = ConfigReader.getValue('rush_rules_path')
        if rush_rule is not None and os.path.isfile(rush_rule):
            self.pyrush = RuSH(rush_rule)
        else:
            # os.path.abspath(None) raises TypeError, so only resolve real paths.
            missing = os.path.abspath(rush_rule) if rush_rule is not None else rush_rule
            logMsg(("File not found", missing))
        self.last_doc_name = ''

        if modifiers is not None and targets is not None:
            if isinstance(modifiers, str) and isinstance(targets, str):
                rule_suffixes = ('.csv', '.tsv', '.txt', '.yml')
                modifiers_is_file = modifiers.endswith(rule_suffixes)
                # Targets may alternatively be text starting with 'Lex' + tab.
                targets_usable = targets.endswith(rule_suffixes) or targets.startswith('Lex\t')
                # Strings that match neither pattern are deliberately left
                # unloaded, exactly as before.
                if modifiers_is_file and targets_usable:
                    self.setModifiersTargetsFromFiles(modifiers, targets)
            else:
                self.setModifiersTargets(modifiers, targets)
        RBDocumentClassifier.instance = self
Beispiel #5
0
 def __init__(self, name=None):
     """Initialise the step state and the sentence-splitter toggle widget.

     The parent initialiser is called with ``name`` first.  The whoosh root
     directory is read from the ``whoosh/root_path`` config value, and a
     three-way ``ToggleButtons`` widget (defaulting to 'Not_To_Split') is
     created for choosing a splitter.
     """
     super().__init__(name)
     self.dao = None
     self.dbpath = ''
     self.remove_old = False
     self.dataset_name = 'orig'
     self.whoosh_root = ConfigReader.getValue("whoosh/root_path")
     self.html1 = widgets.HTML('<h4>Give a name for this dataset: </h4>')
     self.dataset_name_input = self.html2 = None

     splitter_choices = ['TextBlob_Splitter', 'PyRuSh', 'Not_To_Split']
     splitter_hints = [
         'Use TextBlob sentence splitter',
         'Use PyRuSH to split sentences',
         "don't split",
     ]
     self.toggle = widgets.ToggleButtons(
         options=splitter_choices,
         description='',
         disabled=False,
         value='Not_To_Split',
         # button_style accepts 'success', 'info', 'warning', 'danger' or ''
         button_style='',
         tooltips=splitter_hints,
         layout=widgets.Layout(width='70%'),
     )
     self.data_step = None
Beispiel #6
0
    def __init__(self,
                 description='',
                 name=str(Step.global_id + 1),
                 ml_classifier_cls=LogisticBOWClassifier):
        """Set the initial review state, then call the parent initialiser.

        ``ml_classifier_cls`` is stored unchanged; the learning pace is read
        from the ``review/ml_learning_pace`` config value.  ``description``
        is not used in this method.

        NOTE: the default for ``name`` is computed once, when the function is
        defined, not on every call.
        """
        # Placeholders, all starting out as None.
        self.sample_size_input = self.percent_slider = None
        self.samples = {"contain": [], "notcontain": []}
        self.box = self.data = self.docs = self.annos = None
        self.reviewed_docs = self.reviewed_pos = self.leftover = None
        self.ready = False
        # values noted by the original author: reset, continue, addmore
        self.move_next_option = ''

        self.previousReviewed = OrderedDict()
        self.learning_pace = ConfigReader.getValue('review/ml_learning_pace')
        self.un_reviewed = 0
        self.parameters = dict()
        self.parameter_inputs = dict()
        self.ml_classifier_cls = ml_classifier_cls
        super().__init__(name=name)
Beispiel #7
0
 def __init__(self, apikey=None):
     """Store the API key and the service URL.

     When ``apikey`` is ``None`` the key is taken from the ``api_key``
     config value instead.
     """
     self.apikey = ConfigReader.getValue('api_key') if apikey is None else apikey
     self.service = "http://umlsks.nlm.nih.gov"
Beispiel #8
0
    def __init__(self, description='', name=None):
        """Read the GloVe settings, apply defaults and set up the HTML header.

        ``glove/vocab`` falls back to 1900000 and ``glove/vector`` to 300
        when the config holds ``None`` for them.  ``description`` becomes
        the value of an HTML widget, and the parent initialiser is called
        last with ``name``.
        """
        configured_path = ConfigReader.getValue('glove/model_path')
        configured_vocab = ConfigReader.getValue('glove/vocab')
        configured_vector = ConfigReader.getValue('glove/vector')

        self.glove_path = configured_path
        self.glove_vocab = 1900000 if configured_vocab is None else configured_vocab
        self.glove_vector = 300 if configured_vector is None else configured_vector

        # widgets to take the user inputs
        self.glove_path_input = self.glove_vocab_input = None
        self.glove_vector_input = self.api_key_input = None

        self.html = widgets.HTML(value=description)
        super().__init__(name)
 def __init__(
         self,
         description='<h4>Extend keywords through <b>UMLS</b></h4><p>Please select which keywords you want to '
     'check the synonyms from UMLS:',
         name=None):
     """Read the API key, create the title widget and reset the word maps.

     ``description`` becomes the value of the title HTML widget; the parent
     initialiser is called last with ``name``.
     """
     self.api_key = ConfigReader.getValue('api_key')
     self.title = widgets.HTML(value=description)
     # Both mappings start out empty.
     self.to_ext_words, self.to_umls_ext_filters = dict(), dict()
     self.api_input = None
     super().__init__(name)
Beispiel #10
0
    def __init__(self, name=str(Step.global_id + 1), **kwargs):
        super().__init__([], name=name)
        self.docs = []
        self.data = dict()
        self.annos = dict()
        self.reviewed_docs = dict()
        self.threshold = ConfigReader.getValue('review/rb_model_threshold')
        self.nlp = None
        self.js = '''<script>
function setFocusToTextBox(){
    var spans = document.getElementsByClassName("highlighter");
    var id=document.getElementById('d1').pos
    if (id===undefined){
      id=0
    }          
    if (id>=spans.length){
        id=0
    }
    var topPos = spans[id].offsetTop;    
    dv=document.getElementById('d1')
    dv.scrollTop = topPos-20;
    dv.pos=id+1;
}
</script>'''
        self.end_js = '''<script>document.getElementById('d1').pos=0;topPos=0;</script>'''
        self.matcher = None
        self.metaColumns = ConfigReader().getValue("review/meta_columns")
        self.div_height = ConfigReader().getValue("review/div_height")
        logMsg(('self.div_height:', self.div_height))
        self.show_meta_name = ConfigReader().getValue("review/show_meta_name")
        self.hightligh_span_tag = ' <span class="highlighter" style="background-color:  %s ">' % ConfigReader(
        ).getValue("review/highlight_color")
        if 'rush_rule' in kwargs:
            self.rush_rule = kwargs['rush_rule']
        else:
            self.rush_rule = ConfigReader.getValue('rush_rules_path')

        pass
Beispiel #11
0
 def requestUMLSAPIKey(self, rows):
     """Append widgets that ask for the UMLS API key if none is configured.

     Nothing happens when the ``api_key`` config value is non-empty.
     Otherwise an explanatory HTML widget, a text input (kept on
     ``self.api_key_input``) and the separator rows are added to ``rows``.

     Args:
         rows: list of widgets that is extended in place.
     """
     api_key = ConfigReader.getValue("api_key")
     if api_key is None or len(api_key) == 0:
         # The prompt used to show the GloVe-model instructions, which were
         # copied from the embedding setup and did not match this API-key
         # input; the text now describes the API key.
         rows.append(
             widgets.HTML(
                 value='<h4>Set your API Key</h4><p>In order to use the UMLS synonym checking module, you need to set'
                       ' up your API key: (<a href="https://www.nlm.nih.gov/research/umls/user_education/quick_tours/'
                       'UTS-API/UTS_REST_API_Authentication.html" target="_blank">How to get your API Key_at 01:12 from'
                       ' beginning. </a>)</p><p>If you do not set the api key, the UMLS synonym extender will be '
                       '<b>skipped</b>.</p>'))
         self.api_key_input = widgets.Text(value='',
                                           placeholder='',
                                           description='',
                                           disabled=False)
         rows.append(self.api_key_input)
         rows += self.addSeparator()
 def __init__(self, **kwargs):
     """Initialise the sampler state from keyword arguments.

     Required keys: ``previous_sampled_ids`` and ``dao`` (a ``KeyError`` is
     raised if either is missing).  Optional key: ``dataset_id``, which
     defaults to 'origin_doc'.  The whoosh root is read from the
     ``whoosh/root_path`` config value.
     """
     self.sample_size = 0
     self.previous_sampled_ids = kwargs['previous_sampled_ids']
     self.dao = kwargs['dao']
     self.dataset_id = kwargs.get('dataset_id', 'origin_doc')
     self.ignore_case = True
     self.whoosh_root = ConfigReader.getValue('whoosh/root_path')
     self.grouped_ids = dict()
     self.all_contain_ids = set()
     # Counters start at zero.
     self.available_not_contain = self.new_available_not_contain = 0
     self.new_ids = dict()
     self.current_stats = dict()
Beispiel #13
0
 def requestUMLSAPIKey(self, rows):
     """Append widgets that ask for the UMLS API key if none is configured.

     Returns without touching ``rows`` when the ``api_key`` config value is
     non-empty.  Otherwise an explanatory HTML widget, a text input (kept on
     ``self.api_key_input``) and the separator rows are added to ``rows``.
     """
     configured_key = ConfigReader.getValue("api_key")
     if configured_key is not None and len(configured_key) != 0:
         return

     explanation = widgets.HTML(
         value='<h4>Set your API Key</h4><p>In order to use the UMLS synonym checking module, you need to set'
               ' up your API key: (<a href="https://www.nlm.nih.gov/research/umls/user_education/quick_tours/'
               'UTS-API/UTS_REST_API_Authentication.html" target="_blank">How to get your API Key_at 01:12 from'
               ' beginning. </a>)</p><p>If you do not set the api key, the UMLS synonym extender will be '
               '<b>skipped</b>.</p>')
     rows.append(explanation)
     self.api_key_input = widgets.Text(value='', placeholder='', description='', disabled=False)
     rows.append(self.api_key_input)
     rows += self.addSeparator()
Beispiel #14
0
 def __init__(self,
              description='',
              name=str(Step.global_id + 1),
              sampler_cls: type = KeywordStratefiedSampler):
     """Create the sampling widgets and reset the sampling state.

     The parent initialiser is called first with ``name``.  Three widgets
     are created and given observers: a toggle for handling previously
     sampled data, a bounded integer input for the sample size, and a
     percentage slider.  ``description`` is not used in this method.

     NOTE: the default for ``name`` is computed once, when the function is
     defined, not on every call.
     """
     super().__init__(name=name)

     # Toggle for previously sampled data; preselects the last option.
     self.toggle = widgets.ToggleButtons(
         options=sample_options,
         value=sample_options[-1],
         description='What to do with previously sampled data? ',
         style={'description_width': 'initial'},
         button_style='info',
     )
     self.toggle.observe(self.onPreviousSampleHandleChange)

     # Sample-size input; created with min and max both set to 0.
     self.sample_size_input = widgets.BoundedIntText(
         value=0,
         min=0,
         max=0,
         step=1,
         description='Total documents you want to sample:',
         style={'description_width': 'initial'},
     )
     self.sample_size_input.observe(self.onSampleConfigChange)

     self.sampler_cls = sampler_cls
     self.sampled_summary = widgets.HTML(value='')

     slider_options = {
         'value': 70,
         'min': 0,
         'max': 100,
         'step': 5,
         'description': '',
         'disabled': False,
         'continuous_update': False,
         'orientation': 'horizontal',
         'readout': True,
         'readout_format': 'd',
     }
     self.percent_slider = widgets.IntSlider(**slider_options)
     self.percent_slider.observe(self.onSampleConfigChange)

     # save DOC_IDs that contain or not contain keywords filters (used in sampling strategy)
     # NOTE(review): this value is replaced by an empty dict further down.
     self.samples = {"contain": [], "notcontain": []}
     self.box = None
     self.data = {'docs': [], 'annos': OrderedDict()}
     self.ready = False
     # values noted by the original author: reset, continue, addmore
     self.move_next_option = ''
     self.total = self.total_contains = None
     self.un_reviewed = 0
     self.sampler = None
     self.samples = dict()
     self.current_stats = dict()
     self.max_threshold = ConfigReader.getValue("review/rb_model_threshold")
     self.sample_sizes = dict()
Beispiel #15
0
 def navigate(self, button):
     """Save pending inputs, run the background work and start step 1.

     The GloVe config is saved when a GloVe path input exists.  The API key
     is saved when an API key input exists; otherwise the workflow's
     ``api_key`` is taken from the config.  ``workflow.to_continue``
     records whether the clicked button is labelled 'ContinueReviewing';
     in that case step 1 is also completed right after being started.
     """
     if self.glove_path_input is not None:
         self.saveGloveConfig()

     if self.api_key_input is None:
         self.workflow.api_key = ConfigReader.getValue("api_key")
     else:
         self.saveAPIKey()

     self.backgroundWork()

     resume = button.description == 'ContinueReviewing'
     self.workflow.to_continue = True if resume else False
     self.workflow.steps[1].start()
     if resume:
         self.workflow.steps[1].complete()
Beispiel #16
0
    def init_real_time(self):
        """Create the classifier, load the data and queue the review steps.

        A classifier instance is built for the workflow's task name, the
        learning pace is re-read from the config, and ``readData`` is called.
        Background training is triggered when the classifier class reports
        ``NotTrained``.  If documents are available and the loop workflow has
        no steps yet, one ``ReviewML`` step is appended for every document
        that has been reviewed already, plus one for the next document.
        """
        self.ml_classifier = self.ml_classifier_cls(
            task_name=self.workflow.task_name)
        self.learning_pace = ConfigReader.getValue("review/ml_learning_pace")
        self.loop_workflow.filters = self.workflow.filters
        self.readData()
        if self.ml_classifier_cls.status == NotTrained:
            self.backgroundTraining()

        # Reuse the NLP objects held on ReviewRBInit.
        self.nlp = ReviewRBInit.nlp
        self.matcher = ReviewRBInit.matcher

        # self.docs is checked against None below, so the debug log must not
        # iterate it unconditionally.
        logMsg([doc.DOC_ID for doc in (self.docs or [])])

        has_docs = self.docs is not None and len(self.docs) > 0
        loop_is_empty = (self.loop_workflow is None
                         or len(self.loop_workflow.steps) == 0)
        if has_docs and loop_is_empty:
            # Every reviewed document plus the next pending one, never more
            # than the number of documents available.
            last_doc_pos = min(len(self.reviewed_docs) + 1, len(self.docs))
            for doc in self.docs[:last_doc_pos]:
                content = self.genContent(doc)
                annotation = self.annos[doc.DOC_ID] if doc.DOC_ID in self.annos else None
                reviewed = annotation is not None and annotation.REVIEWED_TYPE is not None
                # A stored human review wins over a model prediction.
                prediction = annotation.REVIEWED_TYPE if reviewed else self.getPrediction(doc)
                repeat_step = ReviewML(
                    description=content,
                    options=self.workflow.types,
                    value=prediction,
                    js=self.js,
                    master=self,
                    reviewed=reviewed,
                    button_style=('success' if reviewed else 'info'))
                self.appendRepeatStep(repeat_step)
from SmartAnno.utils.AnnotationTypeDef import AnnotationTypeDef
from SmartAnno.utils.KeywordsFiltering import KeywordsFiltering
from SmartAnno.gui.PreviousNextWidgets import PreviousNextHTML
from SmartAnno.utils.ReviewRBInit import ReviewRBInit
from SmartAnno.utils.ReviewRBLoop import ReviewRBLoop
from SmartAnno.utils.ReviewMLInit import ReviewMLInit
from SmartAnno.utils.ReviewMLLoop import ReviewMLLoop
from SmartAnno.models.logistic.LogisticBOWClassifiers import LogisticBOWClassifier
from SmartAnno.utils.DataSetChooser import DataSetChooser

# Script fragment: builds a Workflow by hand with three steps and then
# creates a ReviewRBInit step.

# Set the root logger to DEBUG level.
logging.getLogger().setLevel(logging.DEBUG)

# Instantiate ConfigReader with the JSON config path before any
# ConfigReader.getValue() call below.
ConfigReader('../conf/smartanno_conf.json')

# Workflow state set directly on the instance: API key, DAO and task name.
wf = Workflow(config_file=ConfigReader.config_file)
wf.api_key = ConfigReader.getValue("api_key")
wf.dao = Dao('sqlite+pysqlite:///../data/test.sqlite',
             sqlalchemy_dao.POOL_DISABLED)
wf.task_name = 'language'
# Steps are appended in order: annotation types, keyword filters, dataset chooser.
wf.append(
    AnnotationTypeDef(
        '<h3>Annotation types:</h3><p>List all the types you want to identify below. Each type per line.<br/>If you'
        'have too many types, try set up them separately, so that you won&apos;t need to choose from a long list '
        'for each sample. </p>',
        name='types'))
wf.append(KeywordsFiltering(name='keywords'))
wf.append(
    DataSetChooser(
        name='dataset_chooser',
        description='<h4>Choose which dateaset you want to use: </h4>'))
# Created here but not appended to the workflow within this fragment.
rb = ReviewRBInit(name="rb_review_init")
Beispiel #18
0
 def restoreStatus(self):
     """Return the value stored under 'status/<name>', or 0 if unset.

     Both ``None`` and the empty string count as unset.
     """
     saved = ConfigReader.getValue('status/' + self.name)
     return 0 if (saved is None or saved == '') else saved
Beispiel #19
0
from SmartAnno.utils.ConfigReader import ConfigReader
from SmartAnno.umls.UMLSFinder import UMLSFinder
# Script fragment: runs one UMLSFinder search and prints the result.
# ConfigReader is instantiated (without arguments) before getValue() is used.
ConfigReader()
# The API key comes from the config; the other options are fixed here.
umls = UMLSFinder(ConfigReader.getValue("api_key"),
                  sources=[],
                  filter_by_length=5,
                  max_query=50,
                  filter_by_contains=True)
# Print whatever search() returns for a single example term.
print(umls.search("ketoacidosis"))
Beispiel #20
0
from SmartAnno.utils.ConfigReader import ConfigReader
from SmartAnno.db.ORMs import Filter
from SmartAnno.gui.Workflow import Workflow
from SmartAnno.utils.AnnotationTypeDef import AnnotationTypeDef
from SmartAnno.utils.IntroStep import IntroStep
from SmartAnno.utils.KeywordsFiltering import KeywordsFiltering
from SmartAnno.utils.KeywordsUMLSExtender import KeywordsUMLSExtender
from SmartAnno.utils.KeywordsUMLSExtenderSetup import KeywordsUMLSExtenderSetup

# Script fragment: builds a Workflow by hand with the annotation-type,
# keyword-filter and UMLS-extender steps.

# Set the root logger to DEBUG level.
logging.getLogger().setLevel(logging.DEBUG)

# Instantiate ConfigReader with the JSON config path before any
# ConfigReader.getValue() call below.
ConfigReader('../conf/smartanno_conf.json')

# Workflow state set directly on the instance: API key, DAO and task name.
wf = Workflow(config_file=ConfigReader.config_file)
wf.api_key = ConfigReader.getValue("api_key")
wf.dao = Dao('sqlite+pysqlite:///../data/test.sqlite', sqlalchemy_dao.POOL_DISABLED)
wf.task_name = 'language'
# Steps are appended in the order they appear below.
wf.append(AnnotationTypeDef(
    '<h3>Annotation types:</h3><p>List all the types you want to identify below. Each type per line.<br/>If you'
    'have too many types, try set up them separately, so that you won&apos;t need to choose from a long list '
    'for each sample. </p>', name='types'))
wf.append(KeywordsFiltering(
    name='keywords'))
wf.append(KeywordsUMLSExtenderSetup(name='umls_extender_setup'))
# The UMLS extender takes all of its options from the "umls/*" config keys.
wf.append(KeywordsUMLSExtender(name='umls_extender', sources=ConfigReader.getValue("umls/sources"),
                               filter_by_length=ConfigReader.getValue("umls/filter_by_length"),
                               filter_by_contains=ConfigReader.getValue("umls/filter_by_contains"),
                               max_query=ConfigReader.getValue("umls/max_query")))
wf.append(
    IntroStep('<h2>Welcome to SmartAnno!</h2><h4>First, let&apos;s import txt data from a directory. </h4>',
Beispiel #21
0
 def pyRuSHSplitter(self, text):
     """Split ``text`` into stripped sentence strings using RuSH.

     A ``RuSH`` instance is built from the ``rush_rules_path`` config value
     on every call.  Each span returned by ``segToSentenceSpans`` is sliced
     out of ``text`` by its ``begin``/``end`` attributes and stripped.
     """
     splitter = RuSH(ConfigReader.getValue('rush_rules_path'))
     pieces = []
     for span in splitter.segToSentenceSpans(text):
         pieces.append(text[span.begin:span.end].strip())
     return pieces