import os
import pickle
import time

import numpy as np
import pandas as pd
from flask import current_app
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import NaturalLanguageUnderstandingV1 as NaLaUn
from ibm_watson.natural_language_understanding_v1 import (
    Features, ConceptsOptions, EntitiesOptions, KeywordsOptions)

# App-local names (db, Corpus, CorpusResult, norm_dot, scale,
# get_corpus_archetypes) are assumed to be importable from the
# application's own modules.


def analyze_text(corpus_id, text, type, n_archs):
    # Request concept, entity, and keyword analysis from Watson NLU.
    features = Features(
        concepts=ConceptsOptions(),
        entities=EntitiesOptions(),
        keywords=KeywordsOptions(),
    )
    authenticator = IAMAuthenticator(
        current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_IAM_APIKEY'])
    service = NaLaUn(
        version=current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_VERSION'],
        authenticator=authenticator)
    service.set_service_url(
        current_app.config['NATURAL_LANGUAGE_UNDERSTANDING_URL'])

    response = service.analyze(text=text, features=features)

    # Collect each feature type into its own DataFrame.
    results = {}
    for typ in ['entities', 'concepts', 'keywords']:
        results[typ] = pd.DataFrame(response.result[typ])

    # Normalized concept-relevance vector for the analyzed document.
    test_vec = \
        results['concepts'].set_index('text')[['relevance']].apply(norm_dot)
    archetypes = get_corpus_archetypes(corpus_id, type=type, n_archs=n_archs)

    # Select the subset of features in the corpus that cover the test
    # vector, then score the document against each archetype.
    in_common = list(set(test_vec.index).intersection(
        set(archetypes.fn.columns)))
    similarities = (
        (archetypes.fn[in_common] @ test_vec.loc[in_common]) * 100
    ).applymap(int)
    similarities.columns = ['similarity %']

    # Expand the test vector onto the full corpus feature space so it can
    # be compared column-by-column against the archetype features.
    test_vec_expanded = pd.DataFrame(
        test_vec, index=archetypes.f.columns).apply(scale).fillna(-0.1)
    compare = archetypes.f.T.apply(scale)
    compare['DOC'] = test_vec_expanded.apply(scale)

    # For each archetype, keep the features it weights above a small
    # threshold, paired with the document's values for those features.
    archetype_maps = []
    for ix in archetypes.f.index:
        cmp = compare.sort_values(by=ix, ascending=True)[[ix, 'DOC']]
        cmp = cmp[cmp[ix] > 0.1]
        archetype_maps.append(cmp.applymap(np.sqrt))

    return similarities, archetype_maps
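
# A minimal usage sketch for analyze_text, not part of the original module.
# Assumptions: it runs inside a Flask request/app context (analyze_text reads
# credentials from current_app.config); the corpus id and n_archs below are
# hypothetical placeholders, and `type` is assumed to select one of the
# 'entities' / 'concepts' / 'keywords' feature sets.
def _example_analyze_text():
    similarities, archetype_maps = analyze_text(
        corpus_id=1,        # hypothetical corpus id
        text='Machine learning systems improve with more data.',
        type='concepts',    # assumed feature type
        n_archs=6,          # hypothetical number of archetypes
    )
    print(similarities)     # one 'similarity %' row per archetype
    for amap in archetype_maps:
        print(amap.head())  # top weighted features for each archetype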

def analyze_corpus(app, name, directory):
    features = Features(
        concepts=ConceptsOptions(),
        entities=EntitiesOptions(),
        keywords=KeywordsOptions(),
    )
    with app.app_context():
        authenticator = IAMAuthenticator(
            app.config['NATURAL_LANGUAGE_UNDERSTANDING_IAM_APIKEY'])
        service = NaLaUn(
            version=app.config['NATURAL_LANGUAGE_UNDERSTANDING_VERSION'],
            authenticator=authenticator)
        service.set_service_url(
            app.config['NATURAL_LANGUAGE_UNDERSTANDING_URL'])

        filenames = os.listdir(directory)

        # Register the corpus up front so its status can be polled while
        # the analysis runs.
        new_corpus = Corpus(name=name, status='processing')
        db.session.add(new_corpus)
        db.session.commit()
        print('Analyzing corpus in thread. Corpus ID: ' + str(new_corpus.id))

        count = 0
        for file in filenames:
            path = os.path.join(directory, file)
            if not os.path.isfile(path) or not file.endswith('.txt'):
                continue
            with open(path) as f:
                # Read once, outside the retry loop; a retry after a
                # failed attempt would otherwise get an empty string.
                contents = f.read()
            for i in range(3):
                try:
                    results = service.analyze(text=contents,
                                              features=features)
                    pickled_results = pickle.dumps(results)
                    new_results = CorpusResult(
                        corpus_id=new_corpus.id,
                        name=file.replace('.txt', ''),
                        data=pickled_results)
                    db.session.add(new_results)
                    db.session.commit()
                    count += 1
                    print('Processed file #{}: {}'.format(count, file))
                except Exception as e:
                    print(e)
                    time.sleep(0.5)
                    print('Retrying...')
                else:
                    break
            else:
                print('Failed to analyze a file ({}) after '
                      'multiple attempts.'.format(file))

        new_corpus.status = 'ready'
        db.session.commit()
        print('Finished analyzing corpus.')
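
# A minimal usage sketch for analyze_corpus, not part of the original module.
# Its log line ('Analyzing corpus in thread...') suggests it is meant to run
# in a background thread; the Flask app object is passed in explicitly so the
# thread can push its own app context. The corpus name and directory below
# are hypothetical placeholders.
def _example_analyze_corpus(app):
    import threading
    thread = threading.Thread(
        target=analyze_corpus,
        args=(app, 'my-corpus', '/path/to/texts'),  # hypothetical values
        daemon=True,
    )
    thread.start()
    return thread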