Example #1
# Assumed imports for this snippet; the iam_apikey/url constructor arguments
# are from the older (pre-4.x) ibm-watson SDK that this example targets.
import json

from ibm_watson import NaturalLanguageUnderstandingV1 as NLU
from ibm_watson.natural_language_understanding_v1 import (
    Features, EntitiesOptions, KeywordsOptions)


def relevancy_dict(chunk):
    service = NLU(
        version='2018-03-16',
        url='https://gateway.watsonplatform.net/natural-language-understanding/api',
        iam_apikey='#####')
    response = service.analyze(text=chunk,
                               features=Features(
                                   entities=EntitiesOptions(),
                                   keywords=KeywordsOptions())).get_result()
    # Round-trip through JSON to return a plain, detached dict copy of the response.
    analysis = json.dumps(response, indent=2)
    return json.loads(analysis)
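A minimal usage sketch (assumes the imports above and a real API key; keyword entries in the NLU response carry 'text' and 'relevance' fields):

analysis = relevancy_dict('Watson NLU extracts entities and keywords from raw text.')
for kw in analysis.get('keywords', []):
    print(kw['text'], kw['relevance'])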
Example #2
# Assumed imports for this snippet; norm_dot, scale and get_corpus_archetypes
# are project-local helpers referenced below.
import numpy as np
import pandas as pd
from flask import current_app
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import NaturalLanguageUnderstandingV1 as NaLaUn
from ibm_watson.natural_language_understanding_v1 import (
    Features, ConceptsOptions, EntitiesOptions, KeywordsOptions)


def analyze_text(corpus_id, text, type, n_archs):
    features = Features(
        concepts=ConceptsOptions(),
        entities=EntitiesOptions(),
        keywords=KeywordsOptions(),
    )
    authenticator = IAMAuthenticator(
        current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_IAM_APIKEY']
    )
    service = NaLaUn(
        version=current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_VERSION'],
        authenticator=authenticator)
    service.set_service_url(
        current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_URL']
    )
    response = service.analyze(
        text=text,
        features=features
    )
    results = {}
    typ_list = ['entities', 'concepts', 'keywords']
    for typ in typ_list:
        results[typ] = pd.DataFrame(response.result[typ])

    test_vec = \
        results['concepts'].set_index('text')[['relevance']].apply(norm_dot)
    archetypes = get_corpus_archetypes(corpus_id, type=type, n_archs=n_archs)

    # Select the subset of features in corpus that cover the test vector.
    in_common = list(set(test_vec.index).intersection(
        set(archetypes.fn.columns)
    ))

    similarities = (
        (archetypes.fn[in_common] @ test_vec.loc[in_common]) * 100
    ).applymap(int)
    similarities.columns = ['similarity %']

    test_vec_expanded = pd.DataFrame(
        test_vec,
        index=archetypes.f.columns
    ).apply(scale).fillna(-0.1)

    compare = archetypes.f.T.apply(scale)
    compare['DOC'] = test_vec_expanded.apply(scale)

    archetype_maps = []
    for ix in archetypes.f.index:
        cmp = compare.sort_values(by=ix, ascending=True)[[ix, 'DOC']]
        cmp = cmp[cmp[ix] > 0.1]
        archetype_maps.append(cmp.applymap(np.sqrt))

    return similarities, archetype_maps
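A hedged usage sketch; the corpus ID, document text and the Flask application object `app` are placeholders, and the call needs an application context because the function reads `current_app.config`:

with app.app_context():
    similarities, archetype_maps = analyze_text(
        corpus_id=1, text=open('new_doc.txt').read(), type='concepts', n_archs=6)
    print(similarities)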
Example #3
# Assumed imports for this snippet; db, Corpus and CorpusResult are the
# project's SQLAlchemy handle and models.
import os
import pickle
import time

from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import NaturalLanguageUnderstandingV1 as NaLaUn
from ibm_watson.natural_language_understanding_v1 import (
    Features, ConceptsOptions, EntitiesOptions, KeywordsOptions)


def analyze_corpus(app, name, directory):
    features = Features(
        concepts=ConceptsOptions(),
        entities=EntitiesOptions(),
        keywords=KeywordsOptions(),
    )
    with app.app_context():
        authenticator = IAMAuthenticator(
            app.config['NATURAL_LANGUAGE_UNDERSTANDING_IAM_APIKEY'])
        service = NaLaUn(
            version=app.config['NATURAL_LANGUAGE_UNDERSTANDING_VERSION'],
            authenticator=authenticator)
        service.set_service_url(
            app.config['NATURAL_LANGUAGE_UNDERSTANDING_URL'])

        filenames = os.listdir(directory)
        new_corpus = Corpus(name=name, status='processing')
        db.session.add(new_corpus)
        db.session.commit()
        db.session.flush()
        print('Analyzing corpus in thread. Corpus ID: ' + str(new_corpus.id))
        count = 0
        for file in filenames:
            path = os.path.join(directory, file)
            if not os.path.isfile(path) or not file.endswith('.txt'):
                continue
            with open(path) as f:
                text = f.read()  # read once; calling f.read() inside the retry loop would return '' on retries
                for i in range(3):
                    try:
                        results = service.analyze(text=text,
                                                  features=features)
                        pickled_results = pickle.dumps(results)
                        new_results = CorpusResult(corpus_id=new_corpus.id,
                                                   name=file.replace(
                                                       '.txt', ''),
                                                   data=pickled_results)
                        db.session.add(new_results)
                        db.session.commit()
                        count += 1
                        print('Processed file #{}: {} '.format(count, file))
                    except Exception as e:
                        print(e)
                        time.sleep(0.5)
                        print('Retrying...')
                    else:
                        break
                else:
                    print('Failed to analyze a file ({}) after multiple attempts.'.format(file))

        new_corpus.status = 'ready'
        db.session.commit()
        print('Finished analyzing corpus.')
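The log message above mentions running in a thread; one possible way to launch it (the thread wiring, corpus name and directory are illustrative, not part of the original snippet):

import threading

worker = threading.Thread(target=analyze_corpus,
                          args=(app, 'my-corpus', './data/input_texts'))
worker.start()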
Example #4
# Assumed imports; the body refers to the natural_language_understanding_v1
# module under the alias `nlu`.
import time
from typing import Any

import ibm_cloud_sdk_core
import ibm_watson
import ibm_watson.natural_language_understanding_v1 as nlu


def call_nlu_with_retry(doc_html: str,
                        natural_language_understanding: ibm_watson.
                        NaturalLanguageUnderstandingV1, extract_entities: bool,
                        extract_semantic_roles: bool) -> Any:
    """
    Pass a document through Natural Language Understanding, performing the 
    analyses we need for the current use case.
    
    Also handles retrying with exponential backoff.
    
    :param doc_html: HTML contents of the web page
    :param nlu: Preinitialized instance of the NLU Python API
    :returns: Python object encapsulating the parsed JSON response from the web service.
    """
    if extract_entities and extract_semantic_roles:
        nlu_features = nlu.Features(
            entities=nlu.EntitiesOptions(mentions=True),
            semantic_roles=nlu.SemanticRolesOptions())
    elif extract_entities and not extract_semantic_roles:
        nlu_features = nlu.Features(entities=nlu.EntitiesOptions(
            mentions=True))
    elif not extract_entities and extract_semantic_roles:
        nlu_features = nlu.Features(semantic_roles=nlu.SemanticRolesOptions())
    else:
        raise ValueError("Must run at least one NLU model.")

    num_tries = 0
    MAX_RETRIES = 8
    RATE_LIMIT_ERROR_CODE = 429
    last_exception = None
    while num_tries < MAX_RETRIES:
        num_tries += 1
        try:
            return natural_language_understanding.analyze(
                html=doc_html,
                return_analyzed_text=True,
                features=nlu_features).get_result()
        except ibm_cloud_sdk_core.api_exception.ApiException as e:
            # Retry logic in case we hit the rate limit
            if e.code != RATE_LIMIT_ERROR_CODE:
                raise e
            sleep_time = 2**(num_tries - 1)
            print(
                f"Request failed {num_tries} times; retrying in {sleep_time} sec"
            )
            time.sleep(sleep_time)

    raise Exception(f"Exceeded limit of {MAX_RETRIES} retries.")
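A usage sketch (the API key, service URL and version string are placeholders; IAMAuthenticator and set_service_url are the standard way to configure the newer-SDK client):

from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

service = ibm_watson.NaturalLanguageUnderstandingV1(
    version='2021-08-01',
    authenticator=IAMAuthenticator('#####'))
service.set_service_url(
    'https://api.us-south.natural-language-understanding.watson.cloud.ibm.com')
response = call_nlu_with_retry('<html><body>Some page text.</body></html>',
                               service,
                               extract_entities=True,
                               extract_semantic_roles=False)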
Example #5
# Module-level imports; these names are referenced inside the methods below.
# Archetypes (used in archetypes()) is a project-local class not shown here.
import os
import pickle

import pandas as pd
import seaborn as sns

from ibm_watson import NaturalLanguageUnderstandingV1 as NaLaUn
from ibm_watson.natural_language_understanding_v1 import (
    Features, CategoriesOptions, ConceptsOptions, EntitiesOptions,
    KeywordsOptions, RelationsOptions, SyntaxOptions)


class DocumentArchetypes:
    '''
    DocumentArchetypes performs Archetypal Analysis on a corpus consisting of a set of documents, for example a set 
    of articles, books, news stories or medical dictations.
    
    Input parameters:
    
    PATH            - Dictionary with paths to I/O
    PATH['data']    - Directory for input text files. Example: './data/input_texts/'
    PATH['results'] - Directory for output.           Example: './data/output_nlu/'
    
    NLU                   - Dictionary with information for running Watson NLU
    NLU['apikey']         - apikey for running Watson NLU
    NLU['apiurl']         - URL for Watson NLU API
    NLU['version']        - Watson NLU version, e.g. '2019-07-12'
    NLU['features']       - Features requested from Watson NLU for each document in the set, e.g. 
                                Features(
                                categories= CategoriesOptions(),
                                concepts  = ConceptsOptions(),
                                entities  = EntitiesOptions(),
                                keywords  = KeywordsOptions(),
                                relations = RelationsOptions(),
                                syntax    = SyntaxOptions()
                                )

    Attributes:

        self.PATH           - I/O paths (as passed in)
        self.NLU            - Watson NLU settings (as passed in)
        self.nlu_model      - initialized NaturalLanguageUnderstandingV1 client
        self.filenames      - input text filenames found in PATH['data']
        self.dictation_dic  - raw text of each dictation, keyed by filename stem
        self.watson         - Watson NLU responses, keyed by dictation
        self.watson_nlu     - the same results converted to pandas DataFrames
        self.archetypes_dic - cached Archetypes results per type and hyperparameters
    '''
    
    def __init__(self, PATH, NLU):
        self.PATH = PATH
        self.NLU  = NLU
        self.nlu_model  = NaLaUn(version=NLU['version'] , iam_apikey = NLU['apikey'], url = NLU['apiurl'])  #Local Natural Language Understanding object
        self.archetypes_dic = {}
        
        ################
        ## PREPARE DATA 
        ################
        self.filenames = os.listdir(self.PATH['data']) 
        self.dictation_dic = {}            #dictionary for dictation files
        for name in self.filenames:
            self.dictation_dic[name.replace('.txt','')] = open(self.PATH['data']+name).read()
        
        ###############################
        ## PERFORM WATSON NLU ANALYSIS
        ###############################
        
        self.watson = {}    #Dictionary with Watson-NLU results for each dictation
        
        self.watson_pkl = PATH['results']+'all_dictations_nlu.pkl'  
        pkl_exists = os.path.exists(self.watson_pkl)

        if pkl_exists:
            self.watson = pickle.load( open( self.watson_pkl, "rb" ) )

        else: #perform nlu-analysis on dictations
            for item in list(self.dictation_dic.items()):
                lbl  = item[0]
                text = item[1]
                self.watson[lbl] = self.nlu_model.analyze(text = text, features=NLU['features'])
                f = open(PATH['results']+str(lbl)+'_nlu.pkl','wb')
                pickle.dump(self.watson[lbl],f)
                f.close()

            f = open(self.watson_pkl,'wb')
            pickle.dump(self.watson,f)
            f.close() 

        # Copy Watson NLU results to Pandas Dataframes
        self.watson_nlu = {}
        for dctn in self.watson.items():
            self.watson_nlu[dctn[0]] = {}
            for item in list(dctn[1].result.items()):
                self.watson_nlu[dctn[0]][item[0]]=pd.DataFrame(list(item[1]))


    ##############
    # ARCHETYPAL ANALYSIS
    ##############

    def archetypes(self,typ='entities',n_archs=6,bootstrap = False, bootstrap_frac = 0.5):
        hyperparam = (n_archs,bootstrap,bootstrap_frac)
        if typ not in self.archetypes_dic.keys():
            self.archetypes_dic[typ] = {}
        if hyperparam not in self.archetypes_dic[typ].keys():
            self.archetypes_dic[typ][hyperparam] = {}
            df = pd.DataFrame()
            for key in self.watson_nlu:
                dfx = self.watson_nlu[key][typ].copy()
                dfx['dictation'] = key
                df = df.append(dfx,sort=True)
            if typ == 'entities':
                df = df[df['type']=='HealthCondition']
                df.rename({'relevance': 'rel0'}, axis=1,inplace=True)
                df['relevance'] = df['rel0'] * df['confidence']
            mat = df.pivot_table(index='dictation',columns='text',values='relevance').fillna(0)
            self.archetypes_dic[typ][hyperparam] = Archetypes(mat,n_archs,bootstrap = bootstrap, bootstrap_frac = bootstrap_frac)
        return self.archetypes_dic[typ][hyperparam]


    def display_archetype(self, typ='entities', n_archs=6, arch_nr=0, var='variables', threshold=0.1):
        if var == 'variables':
            arc = self.archetypes(typ, n_archs).f.T.sort_values(by=arch_nr, ascending=False)
            result = arc[
                        arc[arch_nr] >= (threshold * arc[arch_nr][0])
                     ]
            return result
        elif var == 'dictations':
            arc = sns.clustermap(self.archetypes(typ, n_archs).o).data2d
            return arc
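A construction sketch based on the class docstring (paths, API key, service URL and version are placeholders):

PATH = {'data': './data/input_texts/', 'results': './data/output_nlu/'}
NLU = {
    'apikey':   '#####',
    'apiurl':   'https://gateway.watsonplatform.net/natural-language-understanding/api',
    'version':  '2019-07-12',
    'features': Features(concepts=ConceptsOptions(),
                         entities=EntitiesOptions(),
                         keywords=KeywordsOptions()),
}
da = DocumentArchetypes(PATH, NLU)
top_entities = da.display_archetype(typ='entities', n_archs=6, arch_nr=0)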
Example #6
dictation_analysis = {}
dian = dictation_analysis

# If a pickled NLU-results file already exists - read it.
# If it does NOT exist, run the analysis and pickle the results.
dian_pkl_file = PATH['results']+'all_dictations_nlu.pkl'  
dian_pkl_exists = os.path.exists(dian_pkl_file)

if dian_pkl_exists:
    dian = pickle.load( open( dian_pkl_file, "rb" ) )

else: #perform nlu-analysis on dictations
    for item in list(dictation_dic.items()):
        lbl  = item[0]
        text = item[1]
        dian[lbl] = nlu.analyze(text = text, features=NLU['features'])
        f = open(PATH['results']+str(lbl)+'_nlu.pkl','wb')
        pickle.dump(dian[lbl],f)
        f.close()

    f = open(dian_pkl_file,'wb')
    pickle.dump(dian,f)
    f.close()  

# Transform dian to Pandas Dataframes
df_dic = {}
for dctn in dian.items():
    df_dic[dctn[0]] = {}
    for item in list(dctn[1].result.items()):
        df_dic[dctn[0]][item[0]]=pd.DataFrame(list(item[1]))
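The fragment above assumes that `PATH`, `NLU`, `dictation_dic` and an initialized `nlu` client were defined earlier in the notebook; a rough sketch of that setup (paths, key and feature set are placeholders, using the same pre-4.x SDK constructor as the other examples on this page):

import os
import pickle

import pandas as pd
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_watson.natural_language_understanding_v1 import (
    Features, EntitiesOptions, KeywordsOptions)

PATH = {'data': './data/input_texts/', 'results': './data/output_nlu/'}
NLU = {'features': Features(entities=EntitiesOptions(), keywords=KeywordsOptions())}
nlu = NaturalLanguageUnderstandingV1(
    version='2019-07-12', iam_apikey='#####',
    url='https://gateway.watsonplatform.net/natural-language-understanding/api')
dictation_dic = {fn.replace('.txt', ''): open(PATH['data'] + fn).read()
                 for fn in os.listdir(PATH['data']) if fn.endswith('.txt')}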
Example #7
# Module-level imports; these names are referenced inside the methods below.
# ls, random_split, scale and Archetypes are project-local helpers not shown here.
import os
import pickle

import pandas as pd
import seaborn as sns

from ibm_watson import NaturalLanguageUnderstandingV1 as NaLaUn
from ibm_watson.natural_language_understanding_v1 import (
    Features, CategoriesOptions, ConceptsOptions, EntitiesOptions,
    KeywordsOptions, RelationsOptions, SyntaxOptions)


class WatsonDocumentArchetypes:
    '''
    WatsonDocumentArchetypes performs Archetypal Analysis on a corpus consisting of a set of documents, for example a set 
    of articles, books, news stories or medical dictations.
    
    Input parameters:
    
    PATH            - Dictionary with paths to I/O
    PATH['data']    - Directory for input text files. Example: './data/input_texts/'
    PATH['results'] - Directory for output.           Example: './data/output_nlu/'
    
    NLU                   - Dictionary with information for running Watson NLU
    NLU['apikey']         - apikey for running Watson NLU
    NLU['apiurl']         - URL for Watson NLU API
    NLU['version']        - Watson NLU version, e.g. '2019-07-12'
    NLU['features']       - Features requested from Watson NLU for each document in the set, e.g. 
                                Features(
                                categories= CategoriesOptions(),
                                concepts  = ConceptsOptions(),
                                entities  = EntitiesOptions(),
                                keywords  = KeywordsOptions(),
                                relations = RelationsOptions(),
                                syntax    = SyntaxOptions()
                                )

    Attributes:

        self.PATH            - I/O paths (as passed in)
        self.NLU             - Watson NLU settings (as passed in)
        self.train_test      - relative size of the test set (False = no split)
        self.nlu_model       - initialized NaturalLanguageUnderstandingV1 client
        self.filenames       - input text filenames; self.names / self.all_names hold their stems
        self.dictation_dic   - raw text of each dictation (self.dictation_df as a pandas Series)
        self.watson          - Watson NLU responses, keyed by dictation
        self.watson_nlu      - the same results converted to pandas DataFrames
        self.X_matrix_dic    - pivoted dictation-by-term matrices, per analysis type
        self.archetypes_dic  - cached Archetypes results per type and hyperparameters
    '''
    
    def __init__(self, PATH, NLU, 
                 train_test = False):
        
        self.PATH       = PATH
        self.NLU        = NLU
        # To randomly partition documents into train/test sets, set train_test
        # to the relative size of the test set (1 = 100%).
        self.train_test = train_test
        
        self.nlu_model  = NaLaUn(version=NLU['version'] , iam_apikey = NLU['apikey'], url = NLU['apiurl'])  #Local Natural Language Understanding object
            # Initiate X_matrix dictionaries
        self.X_matrix_dic = {}
        self.X_matrix_train_dic = {}
        self.X_matrix_test_dic  = {}
        self.archetypes_dic = {} 
 
        ################
        ## PREPARE DATA 
        ################
        self.filenames = ls(self.PATH['data']+'*.txt', name_only=True)  # all filenames ending with '.txt' 
        self.names     = [name.replace('.txt','') for name in self.filenames]
        self.all_names = self.names *1      # if train_test - self.names will be set to self.names_train
        self.dictation_dic = {}             # dictionary for dictation files
        for name in self.filenames:
            self.dictation_dic[name.replace('.txt','')] = open(self.PATH['data']+name, encoding="utf-8").read()
        self.dictation_df = pd.Series(self.dictation_dic)
            
        ####################
        ## TRAIN-TEST SPLIT 
        ####################
        if self.train_test:  # 0 < train_test < 1: proportion of names set aside as the test set (rounded down)
            self.names_test , self.names_train = random_split(self.all_names , self.train_test)
            self.names = self.names_train

        ###############################
        ## PERFORM WATSON NLU ANALYSIS
        ###############################
        
        self.watson = {}    #Dictionary with Watson-NLU results for each dictation
        
        self.watson_pkl = PATH['results']+'all_dictations_nlu.pkl'  
        pkl_exists = os.path.exists(self.watson_pkl)

        if pkl_exists:
            self.watson = pickle.load( open( self.watson_pkl, "rb" ) )

        else: #perform nlu-analysis on dictations
            for item in list(self.dictation_dic.items()):
                lbl  = item[0]
                text = item[1]
                self.watson[lbl] = self.nlu_model.analyze(text = text, features=NLU['features'])
                f = open(PATH['results']+str(lbl)+'_nlu.pkl','wb')
                pickle.dump(self.watson[lbl],f)
                f.close()

            f = open(self.watson_pkl,'wb')
            pickle.dump(self.watson,f)
            f.close() 

        # Copy Watson NLU results to Pandas Dataframes
        self.watson_nlu = {}
        for dctn in self.watson.items():
            self.watson_nlu[dctn[0]] = {}
            for item in list(dctn[1].result.items()):
                self.watson_nlu[dctn[0]][item[0]]=pd.DataFrame(list(item[1]))


    ##############
    # ARCHETYPAL ANALYSIS
    ##############

    # CONSTRUCT X- MATRIX
    def X_matrix(self,typ = 'entities'):
        '''
        Construct the archetypal analysis X-matrix by pivoting the dataframe in the 
        dictionary my_wda.watson_nlu that contains the Watson NLU analysis in question
        
        X_matrix(typ)
            rows   : Dictations 
            columns: Variables; keywords/entities/concepts, from Watson NLU analysis
            values : Weights, from Watson NLU analysis
        
        The constructed X_matrix(typ) is saved as X_matrix_dic[typ].

        If my_wda.train_test has a value (not False), X_matrix_train_dic[typ] and X_matrix_test_dic[typ]
        are computed and added to their respective dictionaries as well.
        '''
        if typ not in self.X_matrix_dic.keys():
            df = pd.DataFrame()
            for key in self.names:
                dfx = self.watson_nlu[key][typ].copy()
                dfx['dictation'] = key
                df = df.append(dfx,sort=True)
            if typ == 'entities':
                df = df[df['type']=='HealthCondition']
                df.rename({'relevance': 'rel0'}, axis=1,inplace=True)
                df['relevance'] = df['rel0'] * df['confidence']
            self.X_matrix_dic[typ] = df.pivot_table(index='dictation',columns='text',values='relevance').fillna(0)
        
        if self.train_test:
            self.X_matrix_train_dic[typ] = self.X_matrix_dic[typ]
            
            df = pd.DataFrame()
            for key in self.names_test:
                dfx = self.watson_nlu[key][typ].copy()
                dfx['dictation'] = key
                df = df.append(dfx,sort=True)
            if typ == 'entities':
                df = df[df['type']=='HealthCondition']
                df.rename({'relevance': 'rel0'}, axis=1,inplace=True)
                df['relevance'] = df['rel0'] * df['confidence']
            self.X_matrix_test_dic[typ] = df.pivot_table(index='dictation',columns='text',values='relevance').fillna(0)
        return self.X_matrix_dic[typ]

    # CALCULATE ARCHETYPES
    def archetypes(self,typ='entities',n_archs=6,bootstrap = False, bootstrap_frac = 0.5):
        if typ not in self.archetypes_dic.keys():
            self.archetypes_dic[typ] = {}
        hyperparam = (n_archs,bootstrap,bootstrap_frac)
        self.X_matrix(typ)
        self.archetypes_dic[typ][hyperparam] = Archetypes(self.X_matrix(typ),n_archs,bootstrap = bootstrap, bootstrap_frac = bootstrap_frac)
        return self.archetypes_dic[typ][hyperparam]


    def display_archetype(self, arch_nr=-1, typ='entities', n_archs=6, var='variables', threshold=0.1, norm=scale):
        # Select either the archetype feature matrix ('variables') or the
        # document occupancy matrix ('dictations').
        fun = {'variables' : lambda: self.archetypes(typ=typ, n_archs=n_archs).f.T,
               'dictations': lambda: self.archetypes(typ=typ, n_archs=n_archs).o,
               }
        f  = fun[var]()
        fn = f.apply(norm)
        if arch_nr == -1:
            return sns.clustermap(f).data2d
        else:
            arc = f.sort_values(by=arch_nr,ascending = False)
            result = arc[
                        arc[arch_nr] >= (threshold * arc[arch_nr][0])
                        ]
            return result
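A usage sketch mirroring the docstring (paths, API key, URL, version and feature set are placeholders; scale and Archetypes are helpers from the surrounding project):

PATH = {'data': './data/input_texts/', 'results': './data/output_nlu/'}
NLU = {
    'apikey':   '#####',
    'apiurl':   'https://gateway.watsonplatform.net/natural-language-understanding/api',
    'version':  '2019-07-12',
    'features': Features(categories=CategoriesOptions(),
                         concepts=ConceptsOptions(),
                         entities=EntitiesOptions(),
                         keywords=KeywordsOptions(),
                         relations=RelationsOptions(),
                         syntax=SyntaxOptions()),
}
wda = WatsonDocumentArchetypes(PATH, NLU, train_test=0.2)  # hold out ~20% of documents
arch = wda.archetypes(typ='entities', n_archs=6)
print(wda.display_archetype(arch_nr=0, typ='entities', n_archs=6))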