def extract_abstract_dandelion(db, dataset):
    """Fetch and store abstracts for the DBpedia entities of a dataset.

    Every document returned by ``MongoHC(db, dataset).get_all`` (ordered by
    ``id_doc``) is scanned: the ``['lod']['dbpedia']`` value of each item in
    ``doc['dandelion']['annotations']`` is treated as an entity identifier.
    For each identifier not already found through
    ``get_element_by_mongo_id`` on the ``'dbpedia'`` store, ``get_abstract``
    is called and, when it returns a truthy value, a document
    ``{'_id': <entity>, 'abstract': <abstract>}`` is saved to that store.

    Args:
        db: first argument forwarded to ``MongoHC`` (presumably the database
            name -- confirm against ``MongoHC``).
        dataset: second argument forwarded to ``MongoHC`` for the source
            documents.

    Returns:
        None. Results are written through ``mongo_dbpedia.save_document``.

    Any exception raised while handling a document is logged and the loop
    moves on to the next document; the remaining entities of the failing
    document are skipped.
    """
    import traceback

    mongo = MongoHC(db, dataset)
    mongo_dbpedia = MongoHC(db, 'dbpedia')
    docs = list(mongo.get_all(order_by='id_doc'))

    for doc in docs:
        try:
            entities = [
                annotation['lod']['dbpedia']
                for annotation in doc['dandelion']['annotations']
            ]
            for entity in entities:
                if mongo_dbpedia.get_element_by_mongo_id(entity):
                    logfun.info('Entities already in database')
                    continue
                logfun.info('Extracting abstract for entity %s' % entity)
                abstract = get_abstract(entity)
                if abstract:
                    mongo_dbpedia.save_document(
                        {'_id': entity, 'abstract': abstract})
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)
        except Exception as err:
            logfun.error("Something awful happened!")
            logfun.error(err)
            # Log the formatted stack trace; the raw traceback object from
            # sys.exc_info()[2] would only be rendered as its repr.
            logfun.error(traceback.format_exc())
Exemple #2
0
def test_text_vectorization():
    """Run a TF-IDF vectorization on one stored document and print results.

    Loads the documents of ``MongoHC('hc', 're0')`` ordered by ``id_doc``,
    keeps only the slice ``[1:2]`` (the second document, when present),
    fits a ``TfidfVectorizer`` on its ``'text'`` field and prints: the
    feature names, the non-zero matrix data, the feature indices sorted by
    descending ``idf_``, the number of features, the matrix shape and the
    first five features in that ordering.
    """
    documents = list(MongoHC('hc', 're0').get_all(order_by='id_doc'))
    corpus = [document['text'] for document in documents[1:2]]

    vectorizer = TfidfVectorizer(
        stop_words='english',
        strip_accents='unicode',
        ngram_range=(1, 1),
        min_df=1,
        max_df=1,
        max_features=200000,
        use_idf=True,
        norm='l2',
    )
    matrix = vectorizer.fit_transform(corpus)
    print(vectorizer.get_feature_names())
    print(matrix.data)

    # argsort is ascending; reversing gives indices by descending idf.
    ranking = np.argsort(vectorizer.idf_)[::-1]
    print(ranking)

    vocabulary = vectorizer.get_feature_names()
    top_n = 5
    top_terms = [vocabulary[position] for position in ranking[:top_n]]

    print(len(vocabulary))
    print(matrix.shape)
    print(top_terms)
def extract_entity(db, dataset):
    """Annotate pending documents and move them to the main dataset store.

    Documents are read from ``MongoHC(db, dataset + '_for_alchemy')``
    ordered by ``id_doc``. For each one, ``getAnnotation(doc['text'])`` is
    called; its first result is iterated as the entity set and its third is
    stored under ``doc['alchemy_response']``. ``get_abstract`` is called for
    every entity and truthy results are collected in ``doc['abstracts']``.
    The entity set is stored as a list under ``doc['entity_set']``, the
    document is saved through ``MongoHC(db, dataset)`` and then removed from
    the source store by its ``id_doc``.

    Args:
        db: first argument forwarded to ``MongoHC`` (presumably the database
            name -- confirm against ``MongoHC``).
        dataset: name used for the destination store; the source store uses
            the same name with the ``'_for_alchemy'`` suffix.

    Returns:
        None.

    Exceptions raised while annotating, saving or removing a document are
    logged and processing continues with the next document, so a failing
    document is left in the source store unless the failure happened after
    it was saved.
    """
    import traceback

    mongo_from = MongoHC(db, dataset + '_for_alchemy')
    mongo_to = MongoHC(db, dataset)

    # Materialize first: documents are removed from the source while looping.
    docs = list(mongo_from.get_all(order_by='id_doc'))

    for doc in docs:
        logfun.info('#' * 80)
        logfun.info('Scanning documents: %(id_doc)s' % doc)
        logfun.info('#' * 80)
        try:
            # The second returned value (sorted annotations) is not used here.
            entity_set, _, response = getAnnotation(doc['text'])
            doc['abstracts'] = []
            doc['alchemy_response'] = response
            for entity in entity_set:
                logfun.info('Extracting abstract for entity %s' % entity)

                abstract = get_abstract(entity)
                if abstract:
                    doc['abstracts'].append(abstract)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)

            doc['entity_set'] = list(entity_set)
            mongo_to.save_document(doc)
            mongo_from.remove_document_by_id(doc['id_doc'])
        except Exception as err:
            logfun.error("Something awful happened!")
            logfun.error(err)
            # Log the formatted stack trace; the raw traceback object from
            # sys.exc_info()[2] would only be rendered as its repr.
            logfun.error(traceback.format_exc())
Exemple #4
0
def test_text_vectorization():
    """Fit a TF-IDF vectorizer on a single stored document and print stats.

    Reads every document from ``MongoHC("hc", "re0")`` ordered by
    ``id_doc`` and uses only the ``[1:2]`` slice of them as the corpus. The
    printed output is, in order: feature names, matrix data, feature
    indices ordered by descending ``idf_``, feature count, matrix shape and
    the first ``top_n`` (5) features of that ordering.
    """
    store = MongoHC("hc", "re0")
    records = list(store.get_all(order_by="id_doc"))
    texts = [record["text"] for record in records[1:2]]

    tfidf = TfidfVectorizer(
        norm="l2",
        use_idf=True,
        ngram_range=(1, 1),
        strip_accents="unicode",
        stop_words="english",
        min_df=1,
        max_df=1,
        max_features=200000,
    )
    weights = tfidf.fit_transform(texts)
    print(tfidf.get_feature_names())
    print(weights.data)

    # Reverse the ascending argsort so the highest idf comes first.
    order = np.argsort(tfidf.idf_)[::-1]
    print(order)

    names = tfidf.get_feature_names()
    top_n = 5
    best = [names[k] for k in order[:top_n]]

    print(len(names))
    print(weights.shape)
    print(best)
def extract_abstract_dandelion(db, dataset):
    """Save abstracts for DBpedia entities annotated on a dataset's documents.

    Iterates the documents of ``MongoHC(db, dataset)`` ordered by ``id_doc``
    and reads ``annotation['lod']['dbpedia']`` for each item of
    ``doc['dandelion']['annotations']``. An entity for which
    ``get_element_by_mongo_id`` on the ``'dbpedia'`` store returns a truthy
    value is skipped; otherwise ``get_abstract`` is called and a truthy
    result is saved as ``{'_id': <entity>, 'abstract': <abstract>}``.

    Args:
        db: first argument forwarded to ``MongoHC`` (presumably the database
            name -- confirm against ``MongoHC``).
        dataset: second argument forwarded to ``MongoHC`` for the source
            documents.

    Returns:
        None.

    An exception raised for one document is logged together with its stack
    trace, and the function carries on with the next document.
    """
    import traceback

    source = MongoHC(db, dataset)
    mongo_dbpedia = MongoHC(db, 'dbpedia')

    for doc in list(source.get_all(order_by='id_doc')):
        try:
            entities = [
                item['lod']['dbpedia']
                for item in doc['dandelion']['annotations']
            ]
            for entity in entities:
                if mongo_dbpedia.get_element_by_mongo_id(entity):
                    logfun.info('Entities already in database')
                    continue
                logfun.info('Extracting abstract for entity %s' % entity)
                abstract = get_abstract(entity)
                if abstract:
                    dbpedia = {'_id': entity, 'abstract': abstract}
                    mongo_dbpedia.save_document(dbpedia)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)
        except Exception as err:
            logfun.error("Something awful happened!")
            logfun.error(err)
            # format_exc() yields the readable stack trace; logging
            # sys.exc_info()[2] directly only prints the object's repr.
            logfun.error(traceback.format_exc())
def extract_entity(db, dataset):
    """Enrich queued documents with annotations and abstracts, then move them.

    The queue is ``MongoHC(db, dataset + '_for_alchemy')``, read in
    ``id_doc`` order. Each document gets three new fields before being saved
    to ``MongoHC(db, dataset)``: ``'alchemy_response'`` (third value
    returned by ``getAnnotation(doc['text'])``), ``'abstracts'`` (truthy
    results of ``get_abstract`` for each entity of the first returned
    value) and ``'entity_set'`` (that first value converted to a list).
    After a successful save the document is removed from the queue via
    ``remove_document_by_id(doc['id_doc'])``.

    Args:
        db: first argument forwarded to ``MongoHC`` (presumably the database
            name -- confirm against ``MongoHC``).
        dataset: destination store name; the queue store name is this value
            plus ``'_for_alchemy'``.

    Returns:
        None.

    Failures inside the per-document processing are logged with their stack
    trace and do not stop the remaining documents from being processed.
    """
    import traceback

    mongo_from = MongoHC(db, dataset + '_for_alchemy')
    mongo_to = MongoHC(db, dataset)

    # Snapshot the queue up front since entries are removed during the loop.
    pending = list(mongo_from.get_all(order_by='id_doc'))

    for doc in pending:
        logfun.info('#' * 80)
        logfun.info('Scanning documents: %(id_doc)s' % doc)
        logfun.info('#' * 80)
        try:
            # Only the entity set and the raw response are needed here.
            entity_set, _, response = getAnnotation(doc['text'])
            doc['abstracts'] = []
            doc['alchemy_response'] = response
            for entity in entity_set:
                logfun.info('Extracting abstract for entity %s' % entity)

                abstract = get_abstract(entity)
                if abstract:
                    doc['abstracts'].append(abstract)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)

            doc['entity_set'] = list(entity_set)
            mongo_to.save_document(doc)
            mongo_from.remove_document_by_id(doc['id_doc'])
        except Exception as err:
            logfun.error("Something awful happened!")
            logfun.error(err)
            # format_exc() yields the readable stack trace; logging
            # sys.exc_info()[2] directly only prints the object's repr.
            logfun.error(traceback.format_exc())
def entities_distribution(db, dataset):
    """Count occurrences of each entity text across a dataset.

    Reads every document of ``MongoHC(db, dataset)`` ordered by ``id_doc``
    and looks at the ``'text'`` field of each item in
    ``doc['alchemy_response']['entities']``.

    Args:
        db: first argument forwarded to ``MongoHC``.
        dataset: second argument forwarded to ``MongoHC``.

    Returns:
        A tuple ``(entities_dict, entities)`` where ``entities`` is the set
        of distinct entity texts and ``entities_dict`` maps each of them to
        the number of times it appears (every listed item counts once).
    """
    documents = list(MongoHC(db, dataset).get_all(order_by='id_doc'))

    # Flatten all entity texts once; both results derive from this list.
    mentions = [
        entity['text']
        for document in documents
        for entity in document['alchemy_response']['entities']
    ]

    entities = set(mentions)
    entities_dict = {text: 0 for text in entities}
    for text in mentions:
        entities_dict[text] += 1

    return entities_dict, entities
def entities_distribution(db, dataset):
    """Return per-entity occurrence counts and the set of entity texts.

    The documents come from ``MongoHC(db, dataset).get_all`` ordered by
    ``id_doc``; for each one, the ``'text'`` of every item under
    ``['alchemy_response']['entities']`` is collected.

    Args:
        db: first argument forwarded to ``MongoHC``.
        dataset: second argument forwarded to ``MongoHC``.

    Returns:
        ``(entities_dict, entities)``: a dict from entity text to how many
        items carried that text, and the set of all distinct texts.
    """
    store = MongoHC(db, dataset)
    records = list(store.get_all(order_by='id_doc'))

    def iter_texts():
        # Yield the text of every entity item, document by document.
        for record in records:
            for item in record['alchemy_response']['entities']:
                yield item['text']

    entities = set()
    for text in iter_texts():
        entities.add(text)

    entities_dict = dict((text, 0) for text in entities)
    for text in iter_texts():
        entities_dict[text] += 1

    return entities_dict, entities