# Example #1
# 0
def test_text_vectorization():
    mongo_dataset = MongoHC('hc', 're0')
    data = [d for d in mongo_dataset.get_all(order_by='id_doc')]
    text = [d['text'] for d in data[1:2]]
    tfidf_vectorizer = TfidfVectorizer(max_df=1,
                                       max_features=200000,
                                       min_df=1,
                                       stop_words='english',
                                       strip_accents='unicode',
                                       use_idf=True,
                                       ngram_range=(1, 1),
                                       norm='l2')
    tfidf_matrix = tfidf_vectorizer.fit_transform(text)
    print tfidf_vectorizer.get_feature_names()
    print tfidf_matrix.data

    indices = np.argsort(tfidf_vectorizer.idf_)[::-1]
    print indices
    features = tfidf_vectorizer.get_feature_names()
    top_n = 5
    top_features = [features[i] for i in indices[:top_n]]

    print len(features)
    print tfidf_matrix.shape
    print top_features
def test(db, dataset):
    """Fetch document 1114 from *dataset* and pretty-print the raw
    annotation response for its text.

    :param db: MongoDB database name
    :param dataset: collection name
    """
    mongo = MongoHC(db, dataset)
    doc = mongo.get_element_by_id(1114)

    logfun.info('#' * 80)
    logfun.info('Scanning documents: %(id_doc)s' % doc)
    logfun.info('#' * 80)
    # Only the raw response is inspected here; the entity set and the
    # sorted annotations returned by getAnnotation are unused.
    entitySet, annotationsSorted, response = getAnnotation(doc['text'])
    pp.pprint(response)
def test(db, dataset):
    """Fetch document 1114 from *dataset* and pretty-print the raw
    annotation response for its text.

    :param db: MongoDB database name
    :param dataset: collection name
    """
    mongo = MongoHC(db, dataset)
    doc = mongo.get_element_by_id(1114)

    logfun.info('#' * 80)
    logfun.info('Scanning documents: %(id_doc)s' % doc)
    logfun.info('#' * 80)
    # Only the raw response is inspected here; the entity set and the
    # sorted annotations returned by getAnnotation are unused.
    entitySet, annotationsSorted, response = getAnnotation(doc['text'])
    pp.pprint(response)
# Example #4
# 0
def test_text_vectorization():
    """Vectorize a single re0 document with tf-idf and print the
    vocabulary, the raw score data and the five highest-idf features.

    NOTE(review): Python 2 print statements; max_df=1 (an int) keeps
    only terms appearing in at most one document — confirm intended.
    """
    mongo_dataset = MongoHC("hc", "re0")
    data = [d for d in mongo_dataset.get_all(order_by="id_doc")]
    # Only the second document (index 1) is vectorized; the slice keeps
    # the result a list as fit_transform expects.
    text = [d["text"] for d in data[1:2]]
    tfidf_vectorizer = TfidfVectorizer(
        max_df=1,
        max_features=200000,
        min_df=1,
        stop_words="english",
        strip_accents="unicode",
        use_idf=True,
        ngram_range=(1, 1),
        norm="l2",
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(text)
    print tfidf_vectorizer.get_feature_names()
    print tfidf_matrix.data

    # Feature indices ordered by descending idf (rarest terms first).
    indices = np.argsort(tfidf_vectorizer.idf_)[::-1]
    print indices
    features = tfidf_vectorizer.get_feature_names()
    top_n = 5
    top_features = [features[i] for i in indices[:top_n]]

    print len(features)
    print tfidf_matrix.shape
    print top_features
def extract_entity(db, dataset):
    """Annotate every staged document with entities and DBpedia
    abstracts, then move it from ``<dataset>_for_alchemy`` into
    ``<dataset>``.

    :param db: MongoDB database name
    :param dataset: destination collection name
    """
    mongo_from = MongoHC(db, dataset + '_for_alchemy')
    mongo_to = MongoHC(db, dataset)

    docs = list(mongo_from.get_all(order_by='id_doc'))

    for doc in docs:
        logfun.info('#' * 80)
        logfun.info('Scanning documents: %(id_doc)s' % doc)
        logfun.info('#' * 80)
        try:
            entitySet, annotationsSorted, response = getAnnotation(doc['text'])
            doc['abstracts'] = []
            doc['alchemy_response'] = response
            for e in entitySet:
                logfun.info('Extracting abstract for entity %s' % e)

                abstract = get_abstract(e)
                if abstract:
                    doc['abstracts'].append(abstract)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)

            doc['entity_set'] = list(entitySet)
            # Save first, then remove from staging so a failed save
            # never loses the document.
            mongo_to.save_document(doc)
            mongo_from.remove_document_by_id(doc['id_doc'])
        except Exception as e:  # was Py2-only "except Exception, e"
            logfun.error("Something awful happened!")
            logfun.error(e)
            logfun.error(sys.exc_info()[2])
def extract_abstract_dandelion(db, dataset):
    """Cache a DBpedia abstract for every Dandelion entity found on the
    documents of *dataset*, storing them in the ``dbpedia`` collection.

    :param db: MongoDB database name
    :param dataset: collection whose documents carry a 'dandelion' key
    """
    mongo = MongoHC(db, dataset)
    mongo_dbpedia = MongoHC(db, 'dbpedia')
    docs = list(mongo.get_all(order_by='id_doc'))

    for doc in docs:
        try:
            entities = [
                e['lod']['dbpedia'] for e in doc['dandelion']['annotations']
            ]
            for e in entities:
                # Skip entities whose abstract is already cached.
                if mongo_dbpedia.get_element_by_mongo_id(e):
                    logfun.info('Entities already in database')
                    continue
                dbpedia = {}
                logfun.info('Extracting abstract for entity %s' % e)
                abstract = get_abstract(e)
                if abstract:
                    dbpedia['_id'] = e
                    dbpedia['abstract'] = abstract
                    mongo_dbpedia.save_document(dbpedia)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)
        except Exception as e:  # was Py2-only "except Exception, e"
            logfun.error("Something awful happened!")
            logfun.error(e)
            logfun.error(sys.exc_info()[2])
def extract_alchemy(db, dataset):
    """Fetch the annotation response for every document in *dataset*
    lacking an 'alchemy_response' key and store it back.

    :param db: MongoDB database name
    :param dataset: collection name
    """
    mongo = MongoHC(db, dataset)

    docs = list(mongo.get_doc_with_no_key('alchemy_response'))

    for doc in docs:
        try:
            # Only the raw response is kept; the entity set and sorted
            # annotations are discarded here.
            entitySet, annotationsSorted, response = getAnnotation(doc['text'])
            doc['alchemy_response'] = response
            mongo.save_document(doc)
        except Exception as e:  # was Py2-only "except Exception, e"
            logfun.error("Something awful happened!")
            logfun.error(e)
            logfun.error(sys.exc_info()[2])
def extract_alchemy(db, dataset):
    """Fetch the annotation response for every document in *dataset*
    lacking an 'alchemy_response' key and store it back.

    :param db: MongoDB database name
    :param dataset: collection name
    """
    mongo = MongoHC(db, dataset)

    docs = list(mongo.get_doc_with_no_key('alchemy_response'))

    for doc in docs:
        try:
            # Only the raw response is kept; the entity set and sorted
            # annotations are discarded here.
            entitySet, annotationsSorted, response = getAnnotation(doc['text'])
            doc['alchemy_response'] = response
            mongo.save_document(doc)
        except Exception as e:  # was Py2-only "except Exception, e"
            logfun.error("Something awful happened!")
            logfun.error(e)
            logfun.error(sys.exc_info()[2])
def extract_dandelion(db, dataset):
    """Annotate with Dandelion every document in *dataset* that has no
    'dandelion' key yet, storing the full response on the document.

    :param db: MongoDB database name
    :param dataset: collection name
    """
    mongo = MongoHC(db, dataset)

    docs = list(mongo.get_doc_with_no_key('dandelion', order_by='id_doc'))

    for doc in docs:
        try:
            dan = get_entities_from_dandelion(doc['text'])
            logfun.info(dan['timestamp'])
            doc['dandelion'] = dan
            mongo.save_document(doc)
        except Exception as e:  # was Py2-only "except Exception, e"
            logfun.error(traceback.format_exc())
# Example #10
# 0
def test_fabio(db,
               dataset,
               gamma=0.5,
               ranking_metric='pr',
               lsa=False,
               save=False):
    """Cluster *dataset* with clf.cluster_fabio and pretty-print the
    outcome; when *save* is true the result document is stored in the
    'test_fabio' collection of *db*."""
    result_store = MongoHC(db, 'test_fabio')
    outcome = clf.cluster_fabio(
        db, dataset, gamma=gamma, ranking_metric=ranking_metric,
        with_lsa=lsa)
    if save:
        result_store.save_document(outcome)
    pp.pprint(outcome)
def extract_dandelion(db, dataset):
    """Annotate with Dandelion every document in *dataset* that has no
    'dandelion' key yet, storing the full response on the document.

    :param db: MongoDB database name
    :param dataset: collection name
    """
    mongo = MongoHC(db, dataset)

    docs = list(mongo.get_doc_with_no_key('dandelion', order_by='id_doc'))

    for doc in docs:
        try:
            dan = get_entities_from_dandelion(doc['text'])
            logfun.info(dan['timestamp'])
            doc['dandelion'] = dan
            mongo.save_document(doc)
        except Exception as e:  # was Py2-only "except Exception, e"
            logfun.error(traceback.format_exc())
def extract_entity(db, dataset):
    """Annotate every staged document with entities and DBpedia
    abstracts, then move it from ``<dataset>_for_alchemy`` into
    ``<dataset>``.

    :param db: MongoDB database name
    :param dataset: destination collection name
    """
    mongo_from = MongoHC(db, dataset + '_for_alchemy')
    mongo_to = MongoHC(db, dataset)

    docs = list(mongo_from.get_all(order_by='id_doc'))

    for doc in docs:
        logfun.info('#' * 80)
        logfun.info('Scanning documents: %(id_doc)s' % doc)
        logfun.info('#' * 80)
        try:
            entitySet, annotationsSorted, response = getAnnotation(doc['text'])
            doc['abstracts'] = []
            doc['alchemy_response'] = response
            for e in entitySet:
                logfun.info('Extracting abstract for entity %s' % e)

                abstract = get_abstract(e)
                if abstract:
                    doc['abstracts'].append(abstract)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)

            doc['entity_set'] = list(entitySet)
            # Save first, then remove from staging so a failed save
            # never loses the document.
            mongo_to.save_document(doc)
            mongo_from.remove_document_by_id(doc['id_doc'])
        except Exception as e:  # was Py2-only "except Exception, e"
            logfun.error("Something awful happened!")
            logfun.error(e)
            logfun.error(sys.exc_info()[2])
def entities_distribution(db, dataset):
    """Count how often each entity surface form occurs across the
    whole *dataset* collection.

    :param db: MongoDB database name
    :param dataset: collection name
    :returns: tuple ``(entities_dict, entities)`` — *entities* is the
        set of distinct entity texts, *entities_dict* maps each text
        to its number of occurrences.
    """
    mongo = MongoHC(db, dataset)

    data = list(mongo.get_all(order_by='id_doc'))

    entities = set()
    for d in data:
        for e in d['alchemy_response']['entities']:
            entities.add(e['text'])

    # dict.fromkeys replaces the original dict comprehension whose
    # enumerate() index was unused.
    entities_dict = dict.fromkeys(entities, 0)
    for d in data:
        for e in d['alchemy_response']['entities']:
            entities_dict[e['text']] += 1

    return entities_dict, entities
def entities_distribution(db, dataset):
    """Count how often each entity surface form occurs across the
    whole *dataset* collection.

    :param db: MongoDB database name
    :param dataset: collection name
    :returns: tuple ``(entities_dict, entities)`` — *entities* is the
        set of distinct entity texts, *entities_dict* maps each text
        to its number of occurrences.
    """
    mongo = MongoHC(db, dataset)

    data = list(mongo.get_all(order_by='id_doc'))

    entities = set()
    for d in data:
        for e in d['alchemy_response']['entities']:
            entities.add(e['text'])

    # dict.fromkeys replaces the original dict comprehension whose
    # enumerate() index was unused.
    entities_dict = dict.fromkeys(entities, 0)
    for d in data:
        for e in d['alchemy_response']['entities']:
            entities_dict[e['text']] += 1

    return entities_dict, entities
def extract_abstract(db, dataset):
    """Fill the 'abstracts' list of every document that still lacks
    one, looking each entity in its 'entity_set' up on DBpedia.

    :param db: MongoDB database name
    :param dataset: collection name
    """
    mongo = MongoHC(db, dataset)

    docs = list(mongo.get_empty_abstract())

    for doc in docs:
        try:
            for e in doc['entity_set']:
                logfun.info('Extracting abstract for entity %s' % e)
                abstract = get_abstract(e)
                if abstract:
                    doc['abstracts'].append(abstract)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)

            mongo.save_document(doc)
        except Exception as e:  # was Py2-only "except Exception, e"
            logfun.error("Something awful happened!")
            logfun.error(e)
            logfun.error(sys.exc_info()[2])
def extract_abstract(db, dataset):
    """Fill the 'abstracts' list of every document that still lacks
    one, looking each entity in its 'entity_set' up on DBpedia.

    :param db: MongoDB database name
    :param dataset: collection name
    """
    mongo = MongoHC(db, dataset)

    docs = list(mongo.get_empty_abstract())

    for doc in docs:
        try:
            for e in doc['entity_set']:
                logfun.info('Extracting abstract for entity %s' % e)
                abstract = get_abstract(e)
                if abstract:
                    doc['abstracts'].append(abstract)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)

            mongo.save_document(doc)
        except Exception as e:  # was Py2-only "except Exception, e"
            logfun.error("Something awful happened!")
            logfun.error(e)
            logfun.error(sys.exc_info()[2])
def extract_abstract_dandelion(db, dataset):
    """Cache a DBpedia abstract for every Dandelion entity found on the
    documents of *dataset*, storing them in the ``dbpedia`` collection.

    Fixes the original's mixed 2-/4-space indentation and the
    Python-2-only except clause.

    :param db: MongoDB database name
    :param dataset: collection whose documents carry a 'dandelion' key
    """
    mongo = MongoHC(db, dataset)
    mongo_dbpedia = MongoHC(db, 'dbpedia')
    docs = list(mongo.get_all(order_by='id_doc'))

    for doc in docs:
        try:
            entities = [e['lod']['dbpedia']
                        for e in doc['dandelion']['annotations']]
            for e in entities:
                # Skip entities whose abstract is already cached.
                if mongo_dbpedia.get_element_by_mongo_id(e):
                    logfun.info('Entities already in database')
                    continue
                dbpedia = {}
                logfun.info('Extracting abstract for entity %s' % e)
                abstract = get_abstract(e)
                if abstract:
                    dbpedia['_id'] = e
                    dbpedia['abstract'] = abstract
                    mongo_dbpedia.save_document(dbpedia)
                else:
                    logfun.warning('Abstract not found!')
                logfun.info('-' * 80)
        except Exception as e:  # was Py2-only "except Exception, e"
            logfun.error("Something awful happened!")
            logfun.error(e)
            logfun.error(sys.exc_info()[2])
# Example #18
# 0
__author__ = 'biagio'

from mongo_hc import MongoHC
import classifier as clf
import pprint as pp
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from text_utils import TextUtils
import argparse

mongo = MongoHC('hc', 'test_new')


def first_test_re0():
    """Sweep gamma over [0.1, 1) in 0.01 steps, clustering 're1' at
    each value and saving every tagged result document."""
    gammas = np.arange(0.1, 1, 0.01)
    for attempt, gamma in enumerate(gammas, start=1):
        outcome = clf.cluster_alchemy('re1', gamma=gamma, filter=True)
        pp.pprint(outcome)
        outcome['n_attempt'] = attempt
        outcome['test'] = 'fourth'
        mongo.save_document(outcome)


def test_without_entity():
    """Baseline run: cluster 're1' with gamma=1, tag the result as
    'baseline' and store it in the module-level collection."""
    outcome = clf.cluster_alchemy('re1', gamma=1)
    pp.pprint(outcome)
    outcome['test'] = 'baseline'
    mongo.save_document(outcome)


def test_bow():
    """Run the plain bag-of-words pipeline on the re0 dataset.

    NOTE(review): the result is assigned but never used or returned —
    this snippet may be truncated; confirm against the original file.
    """
    result = clf.scipy_algo('re0')
# Example #19
# 0
def test_fabio(db, dataset, gamma=0.5, ranking_metric="pr", lsa=False, save=False):
    """Cluster *dataset* via clf.cluster_fabio and pretty-print the
    result; persist it to the "test_fabio" collection when *save* is set."""
    store = MongoHC(db, "test_fabio")
    outcome = clf.cluster_fabio(
        db,
        dataset,
        gamma=gamma,
        ranking_metric=ranking_metric,
        with_lsa=lsa,
    )
    if save:
        store.save_document(outcome)
    pp.pprint(outcome)