import sys

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Project-local dependencies (import paths assumed; adjust to the repo layout):
#   MongoHC       -- thin MongoDB wrapper used throughout this module
#   get_abstract  -- fetches a DBpedia abstract for an entity URI
#   getAnnotation -- calls AlchemyAPI, returns (entitySet, annotationsSorted, response)
#   logfun        -- the module-level logger


def extract_abstract_dandelion(db, dataset):
    """Fetch a DBpedia abstract for every Dandelion entity in the dataset
    and cache it in the 'dbpedia' collection."""
    mongo = MongoHC(db, dataset)
    mongo_dbpedia = MongoHC(db, 'dbpedia')
    docs = [doc for doc in mongo.get_all(order_by='id_doc')]

    for doc in docs:
        try:
            entities = [e['lod']['dbpedia']
                        for e in doc['dandelion']['annotations']]
            for entity in entities:
                # Skip entities whose abstract is already cached.
                if mongo_dbpedia.get_element_by_mongo_id(entity):
                    logfun.info('Entity already in database')
                    continue
                logfun.info('Extracting abstract for entity %s' % entity)
                abstract = get_abstract(entity)
                if abstract:
                    mongo_dbpedia.save_document({'_id': entity,
                                                 'abstract': abstract})
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)
        except Exception as e:
            logfun.error("Something awful happened!")
            logfun.error(e)
            logfun.error(sys.exc_info()[2])
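# A minimal usage sketch for extract_abstract_dandelion, assuming a reachable
# MongoDB instance and documents that already carry Dandelion annotations
# (the 'hc' db / 're0' dataset names mirror test_text_vectorization below):
#
#     extract_abstract_dandelion('hc', 're0')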
def test_text_vectorization():
    mongo_dataset = MongoHC('hc', 're0')
    data = [d for d in mongo_dataset.get_all(order_by='id_doc')]
    # Vectorize a single document (data[1:2]) just to inspect the output.
    text = [d['text'] for d in data[1:2]]

    tfidf_vectorizer = TfidfVectorizer(max_df=1,
                                       max_features=200000,
                                       min_df=1,
                                       stop_words='english',
                                       strip_accents='unicode',
                                       use_idf=True,
                                       ngram_range=(1, 1),
                                       norm='l2')
    tfidf_matrix = tfidf_vectorizer.fit_transform(text)

    features = tfidf_vectorizer.get_feature_names()
    print features
    print tfidf_matrix.data

    # Rank features by descending idf and keep the top_n of them.
    indices = np.argsort(tfidf_vectorizer.idf_)[::-1]
    print indices
    top_n = 5
    top_features = [features[i] for i in indices[:top_n]]

    print len(features)
    print tfidf_matrix.shape
    print top_features
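# A small companion sketch: the test above ranks features by global idf,
# which is uniform when only one document is vectorized. To rank terms by
# their tf-idf weight inside a single document row instead, a hypothetical
# helper like this would do (dense conversion is fine at this scale):
def top_terms_for_row(tfidf_matrix, features, row=0, top_n=5):
    # Densify one row of the sparse matrix and sort its weights descending.
    weights = tfidf_matrix[row].toarray().ravel()
    ranked = np.argsort(weights)[::-1][:top_n]
    return [(features[i], weights[i]) for i in ranked]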
def extract_entity(db, dataset):
    """Run AlchemyAPI entity extraction on each staged document, attach
    DBpedia abstracts, and move the document to the target collection."""
    mongo_from = MongoHC(db, dataset + '_for_alchemy')
    mongo_to = MongoHC(db, dataset)
    docs = [doc for doc in mongo_from.get_all(order_by='id_doc')]

    for doc in docs:
        logfun.info('#' * 80)
        logfun.info('Scanning document: %(id_doc)s' % doc)
        logfun.info('#' * 80)
        try:
            entitySet, annotationsSorted, response = getAnnotation(doc['text'])
            doc['abstracts'] = []
            doc['alchemy_response'] = response
            for e in entitySet:
                logfun.info('Extracting abstract for entity %s' % e)
                abstract = get_abstract(e)
                if abstract:
                    doc['abstracts'].append(abstract)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)
            doc['entity_set'] = list(entitySet)
            # Persist the enriched document, then drop it from the staging
            # collection so it is not processed twice.
            mongo_to.save_document(doc)
            mongo_from.remove_document_by_id(doc['id_doc'])
        except Exception as e:
            logfun.error("Something awful happened!")
            logfun.error(e)
            logfun.error(sys.exc_info()[2])
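# Usage sketch for extract_entity, assuming the '<dataset>_for_alchemy'
# staging collection was populated beforehand (db/collection names follow
# the convention used above):
#
#     extract_entity('hc', 're0')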
def entities_distribution(db, dataset):
    """Count occurrences of each AlchemyAPI entity surface form across
    the dataset."""
    mongo = MongoHC(db, dataset)
    data = [doc for doc in mongo.get_all(order_by='id_doc')]

    # Collect the set of distinct entity surface forms.
    entities = set()
    for d in data:
        for e in d['alchemy_response']['entities']:
            entities.add(e['text'])

    # Count how often each entity occurs over all documents.
    entities_dict = {e: 0 for e in entities}
    for d in data:
        for e in d['alchemy_response']['entities']:
            entities_dict[e['text']] += 1

    return entities_dict, entities
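# Example: print the ten most frequent entities from the distribution
# (a sketch reusing the 'hc' db / 're0' dataset names from the test above):
#
#     entities_dict, entities = entities_distribution('hc', 're0')
#     ranked = sorted(entities_dict.items(), key=lambda x: x[1], reverse=True)
#     for text, count in ranked[:10]:
#         print '%s: %d' % (text, count)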