Example #1
0
def get_calais_subjects(text, uid):
    """Return the names of OpenCalais social tags found in ``text``.

    The API key and the relevance threshold are read from the
    ``ITagHelperSettingsSchema`` registry record.  Only social tags whose
    ``importance`` is strictly greater than the configured relevance are kept.

    Args:
        text: the content to analyze.
        uid: passed to Calais as ``external_id``.

    Returns:
        A list of tag names.  The list is empty when no API key is
        configured, when the Calais call fails, or when the response carries
        no ``socialTag`` attribute.
    """
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ITagHelperSettingsSchema)
    api_key = settings.calais_api_key
    relevance = settings.calais_relevance
    if not api_key:
        return []

    calais = Calais(api_key)
    try:
        result = calais.analyze(text, external_id=uid)
    except Exception:
        # Best effort: tagging must never break the caller.  ``Exception``
        # (rather than a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        return []

    return [
        tag['name']
        for tag in getattr(result, 'socialTag', [])
        if float(tag['importance']) > relevance
    ]
Example #2
0
def populate_database_with_semantic_data_from_comments(calais_api_key, db_cursor, debug):
    """Run every not-yet-processed comment through Calais and store the results.

    Selects ``id, comment`` from ``comments`` where ``updated = 0``.  For each
    row, entity names are inserted into ``entities`` and topic category names
    into ``topics``; the comment is then marked ``updated=1`` whether or not
    the analysis succeeded.

    Args:
        calais_api_key: key handed to the ``Calais`` client.
        db_cursor: DB-API cursor using ``?`` parameter placeholders.
        debug: when true, print exception info for failures and a ``.`` per
            processed comment.
    """
    calais = Calais(calais_api_key, submitter='usermine')

    db_cursor.execute('SELECT id, comment FROM comments WHERE updated = 0')

    # fetchall() materializes the rows first, so re-using the cursor for the
    # INSERT/UPDATE statements below does not disturb the iteration.
    for comment_data in db_cursor.fetchall():
        comment_id = comment_data[0]
        comment = comment_data[1]

        try:
            result = calais.analyze(comment)

            for entity in getattr(result, 'entities', []):
                db_cursor.execute('INSERT INTO entities (id, entity) VALUES (NULL, ?)', [entity['name']])

            for topic in getattr(result, 'topics', []):
                db_cursor.execute('INSERT INTO topics (id, topic) VALUES (NULL, ?)', [topic['categoryName']])
        except Exception:
            # Deliberately best effort: one failing comment must not stop the
            # batch.  KeyboardInterrupt/SystemExit are no longer swallowed.
            if debug:
                print(sys.exc_info())

        db_cursor.execute('UPDATE comments SET updated=1 WHERE id = ?', [comment_id])

        if debug:
            print('.')
Example #3
0
    def process_calais(content, key):
        """Analyze ``content`` with Calais and return the people it mentions.

        Returns a dict with a single key, ``"people"``, holding the ``name`` of
        every entity whose ``_type`` is ``"Person"``.  A response without an
        ``entities`` attribute yields an empty list.
        """
        client = Calais(key)
        analysis = client.analyze(content)

        names = []
        for item in getattr(analysis, "entities", []):
            if item["_type"] == "Person":
                names.append(item["name"])

        return {"people": names}
Example #4
0
def get_calais_subjects(text, uid):
    """Return the names of OpenCalais social tags found in ``text``.

    The API key and the relevance threshold are read from the
    ``ITagHelperSettingsSchema`` registry record.  Only social tags whose
    ``importance`` is strictly greater than the configured relevance are kept.

    Args:
        text: the content to analyze.
        uid: passed to Calais as ``external_id``.

    Returns:
        A list of tag names.  The list is empty when no API key is
        configured, when the Calais call fails, or when the response carries
        no ``socialTag`` attribute.
    """
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ITagHelperSettingsSchema)
    api_key = settings.calais_api_key
    relevance = settings.calais_relevance
    if not api_key:
        return []

    calais = Calais(api_key)
    try:
        result = calais.analyze(text, external_id=uid)
    except Exception:
        # Best effort: tagging must never break the caller.  ``Exception``
        # (rather than a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        return []

    return [
        tag["name"]
        for tag in getattr(result, "socialTag", [])
        if float(tag["importance"]) > relevance
    ]
Example #5
0
def getOpenCalaisResultFromURL(url):
    """Analyze the document at ``url`` with a randomly chosen Calais API key.

    Returns the Calais result object, or ``None`` when the request fails.
    """
    # Pick from however many keys are configured instead of assuming exactly
    # seven entries (the original hard-coded randint(0, 6)).
    API_KEY = openCalaisKeys[randint(0, len(openCalaisKeys) - 1)]
    calais = Calais(API_KEY, submitter="python-calais demo")
    try:
        result = calais.analyze_url(url)
    except Exception:
        # Best effort; KeyboardInterrupt/SystemExit are allowed to propagate.
        result = None
    return result
Example #6
0
def analyze(request):
    """Return the Calais entities for ``request.POST['content']`` as JSON.

    The response body is the JSON encoding of ``result.get_json_entities()``;
    it is the JSON empty string (``""``) when Calais returns a falsy result
    or when a ``KeyError`` is raised (e.g. the ``content`` field is missing).
    """
    # NOTE(review): the API key is hard-coded in source; consider moving it
    # to settings.
    API_KEY = 'kgmykdr862hdfhkzkuchnxkc'
    calais = Calais(API_KEY, 'python')
    payload = ""
    try:
        result = calais.analyze(request.POST['content'])
        if result:
            payload = result.get_json_entities()
    except KeyError:
        payload = ""
    # ``content_type`` replaces the ``mimetype`` keyword, which newer Django
    # releases no longer accept.
    return HttpResponse(json.dumps(payload),
                        content_type="application/json")
Example #7
0
def calaisify_main(options, args):
    """Print Calais summaries for every URL found in non-calaised entries.

    For each entry returned by ``MongoPersistence.get_non_calaised_entries()``
    the short URLs detected in its ``title`` are expanded and analyzed, and
    the summary, topics and entities of each response are printed.  Entries
    without URLs are skipped.  ``options`` and ``args`` are not used.
    """
    store = persistence.MongoPersistence()
    client = Calais(config.calais["api_key"], submitter="python-calais demo")
    for entry in store.get_non_calaised_entries():
        short_links = short_url.has(entry["title"])
        if not len(short_links):
            continue
        for link in short_links:
            target = short_url.find_real(link)
            analysis = client.analyze_url(target)
            analysis.print_summary()
            analysis.print_topics()
            analysis.print_entities()
Example #8
0
def contract(request, contract):
    """Render ``contract.html`` with the blocks belonging to ``contract``.

    Blocks are fetched in ascending ``date`` order; each one gets an
    ``entities`` key holding the Calais entities of its ``string`` field.
    The template context carries ``items`` (the annotated blocks) and
    ``contract``.
    """
    collection = connection()
    cursor = collection.find({'contracts': contract}, sort=[('date', ASCENDING)])

    client = Calais(settings.CALAIS, submitter="hello world")

    annotated = []
    for block in cursor:
        analysis = client.analyze(block['string'])
        block['entities'] = analysis.entities
        annotated.append(block)

    context = {'items': annotated, 'contract': contract}
    return render_to_response('contract.html', context)
Example #9
0
def analyze(model, text=None, backend='calais'):
    """Extract people, places and organisations from ``model`` via Calais.

    ``text`` is analyzed when given (and truthy); otherwise
    ``model.analysis_text()`` is used.  Every recognised entity is loaded by
    its ``__reference`` URI or created, updated with ``from_calais``, saved
    and attached to ``model`` with ``add_entity``.

    Returns:
        ``None`` when the text is shorter than 100 characters, ``False`` when
        an ``AttributeError`` is raised while processing the entities, and
        otherwise the tuple ``(result, records)``.  ``backend`` is not used.
    """
    from calais import Calais
    from django.conf import settings

    client = Calais(settings.CALAIS_API_KEY, submitter='newscredit')
    _text = text if text else model.analysis_text()

    # Texts under 100 characters are skipped without calling the service.
    if len(_text) < 100:
        return

    result = client.analyze(_text)
    records = []
    place_types = ['City', 'Country', 'Continent', 'Facility', 'Region']
    organisation_types = ['Organization', 'Company']

    try:
        for entity in result.entities:
            kind = entity['_type']
            if kind == 'Person':
                _model = Person
            elif kind in place_types:
                _model = Place
            elif kind in organisation_types:
                _model = Organisation
            else:
                continue

            try:
                _record = _model.objects.get(uri=entity['__reference'])
            except _model.DoesNotExist:
                _record = _model()
            _record.from_calais(entity)
            _record.save()
            model.add_entity(_record)
            records.append(_record)
    except AttributeError:
        # Raised e.g. when the Calais result has no ``entities``; report the
        # failure as False so callers can carry on with other records.
        return False
    return result, records
Example #10
0
def analyze(model, text=None, backend='calais'):
    """Create or update a ``Person`` for every person Calais finds.

    ``text`` is analyzed when given (and truthy); otherwise
    ``model.analysis_text()`` is used.  Each ``Person`` entity is looked up by
    its ``__reference`` URI, created when missing, filled with
    ``from_calais`` and saved.  ``backend`` is not used.
    """
    from calais import Calais
    from django.conf import settings
    client = Calais(settings.CALAIS_API_KEY, submitter='newscredit')
    source_text = text if text else model.analysis_text()
    result = client.analyze(source_text)
    people = []
    for entity in result.entities:
        if entity['_type'] != 'Person':
            continue
        try:
            person = Person.objects.get(uri=entity['__reference'])
        except Person.DoesNotExist:
            person = Person()
        person.from_calais(entity)
        person.save()
        people.append(person)
Example #11
0
def calais_test(request):
    """Analyze the URL given in ``content_url`` and render the entities found.

    An empty ``content_url`` falls back to ``http://n-tv.de``.  The template
    receives the analyzed URL (``text``), a name-to-type mapping of the
    entities (``calais_data``, which also carries a fixed ``name`` entry) and
    the detected document language (``language``).
    """
    content_url = request.REQUEST['content_url']
    if content_url == '':
        content_url = 'http://n-tv.de'

    client = Calais('pc5v39x8sq3mh4mv9zm2ppre', submitter="ask-a-sap-question")
    result = client.analyze_url(content_url)
    result.print_summary()

    c_data = {"name": "Barack"}
    c_data.update((entity['name'], entity['_type']) for entity in result.entities)

    lang = result.doc['meta']['language']
    context = {'text': content_url, 'calais_data': c_data, 'language': lang}
    return render_to_response('calais_test.html', context)
    
    
    
    
Example #12
0
    def __init__(self):
        """Open the database and Calais connections used by this object.

        Both handles are stored in ``self.connections`` under the keys
        ``"db"`` and ``"calais"``.
        """
        self.db_name = "pythia_db"

        database = connect(self.db_name)
        calais_client = Calais("av536xwvy4mgmcbw9cancqmd",
                               submitter="pythia-application")
        self.connections = {"db": database, "calais": calais_client}
Example #13
0
def main():
    """Read text from stdin, link its Calais entities and print it as HTML.

    Angle brackets are stripped from the input before analysis.  Each line of
    the linked text is printed wrapped in a ``<p>`` ... ``</p>`` pair.  When
    no stdin is available, the usage text is shown and the process exits with
    status 1.
    """
    if not sys.stdin:
        usage()
        sys.exit(1)

    # ``text`` instead of ``input`` so the builtin is not shadowed.
    text = sys.stdin.read()
    text = text.replace("<", "")
    text = text.replace(">", "")

    api_key = config.read_config('calais', 'api_key')
    calais = Calais(api_key, submitter="fridakahlo")
    entities = calais.analyze(text).entities

    linked_text = add_links(entities, text)
    for line in linked_text.splitlines():
        # The paragraph is now closed with </p>; the original emitted a
        # second opening <p> instead.
        print("<p> %s </p>" % line)
Example #14
0
def entities(env, start_response):
    """Extracts entities from resume utilizing the OpenCalais webservice.

    Reads the resume text file, sends it to Calais and returns a one-element
    list with an XML document built from ``(name, type, relevance)`` tuples.

    Raises:
        restlite.Status: ``'400 Error Reading File'`` when the resume file
            cannot be read.

    Returns an ``<error>`` string instead when the Calais call fails.
    """
    start_response('200 OK', [('Content-Type', 'text/xml')])
    # NOTE(review): the API key is hard-coded in source.
    API_KEY = "kqyhhfppufvmvxspkstwjxw5"
    calais = Calais(API_KEY, submitter="resume_analysis")
    try:
        with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
            text = f.read()
    except Exception:
        # ``Exception`` keeps KeyboardInterrupt/SystemExit from being turned
        # into a 400; the call form of ``raise`` works on Python 2 and 3.
        raise restlite.Status('400 Error Reading File')
    try:
        results = calais.analyze(text)
    except Exception as e:
        return "<error>%s</error>" % e

    entities_tuples = [(entity['name'], entity['_type'], entity['relevance'])
                       for entity in results.entities]
    doc = create_xml({'entities': entities_tuples})

    return [str(doc.toxml())]
def entities(env, start_response):
    """Extracts entities from resume utilizing the OpenCalais webservice.

    Reads the resume text file, sends it to Calais and returns a one-element
    list with an XML document built from ``(name, type, relevance)`` tuples.

    Raises:
        restlite.Status: ``'400 Error Reading File'`` when the resume file
            cannot be read.

    Returns an ``<error>`` string instead when the Calais call fails.
    """
    start_response('200 OK', [('Content-Type', 'text/xml')])
    # NOTE(review): the API key is hard-coded in source.
    API_KEY = "kqyhhfppufvmvxspkstwjxw5"
    calais = Calais(API_KEY, submitter="resume_analysis")
    try:
        with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
            text = f.read()
    except Exception:
        # ``Exception`` keeps KeyboardInterrupt/SystemExit from being turned
        # into a 400; the call form of ``raise`` works on Python 2 and 3.
        raise restlite.Status('400 Error Reading File')
    try:
        results = calais.analyze(text)
    except Exception as e:
        return "<error>%s</error>" % e

    entities_tuples = [(entity['name'], entity['_type'], entity['relevance'])
                       for entity in results.entities]
    doc = create_xml({'entities': entities_tuples})

    return [str(doc.toxml())]
Example #16
0
def _get_people(text):
    '''
    Send the full story text through Calais and collect every person
    mentioned, together with the places where that person is referred to.

    Returns a dictionary keyed by the person's canonical name (``commonname``
    when present, otherwise ``name``).  Each value is a list of
    ``(exact, suffix, prefix)`` tuples, one per instance reported by Calais:
    the reference text itself, the text following it and the text preceding
    it.

    Returns False when the Calais response has no entities at all.

    Errors raised by the Calais client are not handled here and propagate to
    the caller.
    '''
    client = Calais(API_KEY, submitter="tbc-coref-test")
    annotations = client.analyze(text)

    # Nothing to do without entities.
    if not hasattr(annotations, 'entities'):
        return False

    coref = {}
    for entity in annotations.entities:
        # Companies, places, etc. are of no interest here.
        if entity['_type'] != 'Person':
            continue
        # One tuple per mention, pronouns and other references included.
        mentions = [
            (mention.get('exact'), mention.get('suffix', ''), mention.get('prefix', ''))
            for mention in entity['instances']
        ]
        name = entity.get("commonname", entity.get('name', None))
        coref[name] = mentions
    return coref
Example #17
0
    def __init__(self, content_object, content_fields=None):
        """Set up the Calais client and the content-field mapping.

        ``content_fields`` is used as given when truthy.  Otherwise a dict
        copy of the ``calais_content_fields`` attribute declared on the
        content object's class is used.

        Raises:
            OpenCalaisTagFetchError: when reading that declaration raises
                ``FieldDoesNotExist``.
        """
        super(OpenCalais, self).__init__(content_object)
        self.calais = Calais(settings.CALAIS_API_KEY,
                             settings.CALAIS_SUBMITTER)

        if content_fields:
            self.calais_content_fields = content_fields
            return

        try:
            declared = self.content_object.__class__.calais_content_fields
            self.calais_content_fields = dict(declared)
        except FieldDoesNotExist as e:
            raise OpenCalaisTagFetchError(
                'You need to define calais_content_fields: %s' % e)
Example #18
0
def _get_people(text):
    '''
    Send the full story text through Calais and collect every person
    mentioned, together with the places where that person is referred to.

    Returns a dictionary keyed by the person's canonical name (``commonname``
    when present, otherwise ``name``).  Each value is a list of
    ``(exact, suffix, prefix)`` tuples, one per instance reported by Calais:
    the reference text itself, the text following it and the text preceding
    it.

    Returns False when the Calais response has no entities at all.

    Errors raised by the Calais client are not handled here and propagate to
    the caller.
    '''
    client = Calais(API_KEY, submitter="tbc-coref-test")
    annotations = client.analyze(text)

    # Nothing to do without entities.
    if not hasattr(annotations, 'entities'):
        return False

    coref = {}
    for entity in annotations.entities:
        # Companies, places, etc. are of no interest here.
        if entity['_type'] != 'Person':
            continue
        # One tuple per mention, pronouns and other references included.
        mentions = [
            (mention.get('exact'), mention.get('suffix', ''), mention.get('prefix', ''))
            for mention in entity['instances']
        ]
        name = entity.get("commonname", entity.get('name', None))
        coref[name] = mentions
    return coref
Example #19
0
class classifier():
    """Tag text with Calais entities and attach Qwiki/image links to each tag."""

    # NOTE(review): the API key is hard-coded in source.
    API_KEY = 'xyby6x47ycxj56bkkb83s9he'

    def __init__(self):
        self.calais = Calais(self.API_KEY, submitter='SocialTV Demo')

    def process(self, ip, text):
        """Return one dict per usable entity found in ``text``.

        Entities whose lower-cased name is in ``dirtyWords.DIRTY`` and
        entities without a Qwiki result are dropped.  Each returned dict has
        the keys ``text``, ``tag``, ``qwiki`` and ``img``.  ``ip`` is passed
        on to the image lookup.  Returns ``[]`` when Calais reports no
        entities.
        """
        result = self.calais.analyze(text)
        all_tags = []

        if hasattr(result, 'entities'):
            all_tags.extend(result.entities)

        # Social tags (result.socialTag) are intentionally not used.

        if not all_tags:
            return []

        output = []
        for tag in all_tags:
            name = tag['name']
            if name.lower() not in dirtyWords.DIRTY:
                qwiki = self._get_qwiki_url(name)
                if len(qwiki) > 0:
                    img = self._get_google_image(ip, name)
                    print("Tag: " + name)
                    output.append({'text': text, 'tag': name, 'qwiki': qwiki, 'img': img})
        return output

    def _get_qwiki_url(self, text):
        """Return the Qwiki embed URL for ``text``, falling back to a search.

        The embed URL is returned when it can be opened.  On ``URLError`` the
        Qwiki search API is queried instead: a non-empty result is handed to
        ``_process_qwiki_results``, an empty one yields ``[]``.
        """
        embed_url = ("http://www.qwiki.com/embed/"
                     + urllib2.quote(text.replace(' ', '_'))
                     + "?autoplay=true")
        try:
            # Only reachability matters; close the response instead of
            # leaking the connection.
            urllib2.urlopen(embed_url).close()
            return embed_url
        except urllib2.URLError:
            response = urllib2.urlopen("http://embed-api.qwiki.com/api/v1/search.json?count=1&q="+urllib2.quote(text))
            html = response.read()
            html_eval = json.loads(html)
            if len(html_eval) > 0:
                return self._process_qwiki_results(text, html_eval)
            else:
                return []
Example #20
0
class calaisApi:
    """Thin wrapper around a Calais client with memoized entity lookups."""

    def __init__(self):
        self.calais = Calais(KEY, submitter="GodelTest")

    @persistent_memoize
    def calais_run(self, sentence):
        """Return the Calais entities found in ``sentence``.

        Returns the list of entities when there is at least one, ``False``
        when there are none, and ``None`` when the analysis raises
        ``ValueError``.
        """
        try:
            result = self.calais.analyze(sentence)
        except ValueError:
            return

        found = []
        if hasattr(result, "entities") and len(result.entities) > 0:
            found.extend(result.entities)

        return found if len(found) > 0 else False
def extract_entities(text, retries=5):
    """Return the Calais entity types found in ``text``.

    Input: entity_text
    Output: list with the ``_type`` of every entity Calais reports

    One API key is chosen pseudo-randomly, seeded with ``text`` so the same
    text always maps to the same key.  The call is attempted up to
    ``retries`` times with a one-second pause after each failure; an empty
    list is returned when every attempt fails.

    NOTE: ``random.seed(text)`` reseeds the module-level generator, which
    affects other users of ``random`` in the process.
    """
    import time
    ner_path = os.path.realpath(
        os.path.abspath(
            os.path.join(
                os.path.split(inspect.getfile(inspect.currentframe()))[0],
                "../../ner")))
    # Register the bundled client directory once; the original inserted it on
    # every call, growing sys.path without bound.
    if ner_path not in sys.path:
        sys.path.insert(0, ner_path)
    from calais import Calais
    random.seed(text)
    # NOTE(review): API keys are hard-coded in source.
    API_KEYS = [
        "wdbkpbpsksskkbm2rpqfm4xa", "mt5qu3e4jdrd6jpc9r9ecama",
        "k9fb7rfh7hpbfp238whuggrr", "55rared7un2pnjr23kjtctes",
        "ccw5tvhv5sewvnnnpkfa9ydn", "ne7yxpax4ebug4qz3p4jguej",
        "nsuasahckne72keu8qu6zjd3", "bvuy6mqmr7z7x8jw5f4zzpkr"
    ]
    # Same index draw as before, but only the selected client is built.
    key = API_KEYS[random.randint(0, len(API_KEYS) - 1)]
    calais = Calais(key, submitter="python-calais-demo")
    for _ in range(retries):
        try:
            result = calais.analyze(text)
            # A fresh list per attempt, so a failed attempt cannot leave
            # partial results behind that a retry would duplicate.
            return [calais_entity['_type']
                    for calais_entity in getattr(result, 'entities', [])]
        except Exception:
            logging.exception("failed while calling calais")
            time.sleep(1)
    logging.error("failed with all tries to call calais")
    return []
Example #22
0
### Ariana Giorgi
### 10/31/2014
### Computational Journalism Assignment #3 - Open Calais
### https://code.google.com/p/python-calais/

from calais import Calais
import collections

#set Calais API Key and create new instance
#NOTE(review): the API key is hard-coded in source
API_KEY = "g8gnzpdz52gkwyduv75zecem"
calais = Calais(API_KEY, submitter="python-calais demo")

#demo text (placeholder; replaced below by the contents of stdin_1.txt)
input_text = "George Bush was the President of the United States of America until 2009.  Barack Obama is the new President of the United States now."

#the with-block closes the file on exit, so no further check is needed
with open('stdin_1.txt', 'r') as f:
    input_text = f.read()

result = calais.analyze(input_text)
#result.print_entities()

#initialize dictionary that will contain the linked data
link_list = {}
#initialize detected references count (collected for assignment)
detected_count = 0

#loop through each entity and assign a link
for i in range(len(result.entities)):
    if 'resolutions' in result.entities[i]:
        #if Calais has assigned an RDF value, use that as the link
Example #23
0
def web_extract_terms(text, raw_query='', service='tagthe'):
    """
        Given a text, extract keyword terms with the selected web_service.
        Args:
            text: raw text from where to extract terms
            raw_query: a query which may contextualize the extraction. For
                yahoo it is sent as a separate ``query`` parameter; for every
                other service it is appended to ``text``.
            service: which web service to use (case-insensitive, surrounding
                whitespace ignored)
        Returns:
            a sequence of extracted terms; ``["",]`` when opencalais returns
            neither topics nor entities, ``''`` when the service response
            body is empty
        Raises:
            ValueError: if ``service`` is not a key of WEB_SERVICES
            WebServiceException: if the request fails or the service reports
                an error

    """
    service = service.lower().strip()

    if service not in WEB_SERVICES:
        raise ValueError('%s is an invalid web service, possible choices are %s' % (service, repr(WEB_SERVICES.keys())))

    #1. Build the query:
    query = {}
    apikey = settings.WEB_SERVICES_KEYS.get(service, '')

    if service == 'wordsfinder':
        query = {
            'apikey' : apikey,
            'context': text + raw_query,
        }
    elif service == 'alchemy':
        query = {
            'apikey' : apikey,
            'text' : text + raw_query,
            'outputMode' : 'json'
        }
    elif service == 'yahoo':
        query = {
            'appid': apikey,
            'context': text,
            'output': 'json',
        }
        if raw_query:
            query.update({'query': raw_query})
    elif service == 'tagthe':
        query = {
            'text': text + raw_query,
            'view': 'json',
        }
    elif service == 'opencalais':
        #use the python interface, obtained from:
        #http://www.opencalais.com/applications/python-calais-python-interface-opencalais-api
        #this branch returns directly and never reaches steps 2 and 3
        s = Calais(apikey)
        try:
            res = s.analyze(text + raw_query)
        except Exception:
            raise WebServiceException(service, 'error in request')
        if hasattr(res, 'topics') or hasattr(res, 'entities'):
            retval = [t['categoryName'] for t in res.topics] if hasattr(res, 'topics') else []
            retval += [t['name'] for t in res.entities] if hasattr(res, 'entities') else []
            return retval
        else:
            #raise WebServiceException(service, 'No topics or entities found')
            #logging.info("OpenCalais didn't return topics|entities for %s" %text)
            return ["",]
#    elif service == 'extractor':
#        #use the python interface
#        #logging.debug('using the extractor interface with key %s' % apikey)
#        extractor=ExtractorService(key=apikey, url=WEB_SERVICES[service])
#        raw_response = extractor.extract(text + raw_query)
#        #logging.debug('The raw response: %s' % raw_response)
#
#        if raw_response.get('ExtractionStatus') == '-1':
#            print "failure!"
#            raise WebServiceException(service, "Failure in request")
#        else:
#            #TODO: what DOES it return?
#            return raw_response

    #2. Try to call the service:
    resp = None
    #logging.debug('requesting %s' % WEB_SERVICES[service]+'?%s'%urlencode(query))
    try:
        #HACK: tagthe has issues with POST requests, so try and do a GET
        #max length for a GET request is 2048:
        #http://stackoverflow.com/questions/1344616/max-length-of-query-string-in-an-ajax-get-request
        if service == 'tagthe': # and len(urlencode(query)) <= 2048:
            resp_url = urlopen(WEB_SERVICES[service]+'?%s'%urlencode(query))
        else:
            #HACK: fallback to yahoo if the request is too much for tagthe
            #service = 'yahoo' if service == 'tagthe' else service
            resp_url = urlopen(WEB_SERVICES[service], urlencode(query))
        resp = resp_url.read()
        #this causes the exception...
        #logging.debug( u"%s returned %s" % (service, resp))
    except Exception as e:
        #TODO: retry in timeouts and stuff
        #logging.debug('Error in request: %s' % e, exc_info = True)
        raise WebServiceException(service, 'Error in request to service : %s' % e)

    #3. Process the response:
    if resp:
        result = ''
        if service == 'alchemy':
            data = json.loads(resp)
            if data['status'] == 'ERROR':
                raise WebServiceException(service, 'call returned error status')
            result = [re.sub('-[^ ] ', '', e['text']) for e in data['keywords']]

        elif service == 'yahoo':
            data = json.loads(resp)
            result = data['ResultSet']['Result']

        elif service == 'wordsfinder':
            parsed_response = parseString(resp)
            errors = parsed_response.getElementsByTagName('error')
            if errors:
                # getElementsByTagName returns a list of nodes: read the text
                # of the first <error> element (the original dereferenced the
                # list itself and misspelled ``data``).
                raise WebServiceException(service, 'error code %s' % errors[0].firstChild.data)
            else:
                result = [node.firstChild.data for node in parsed_response.getElementsByTagName('keyword')]
        elif service == 'tagthe':
            data = json.loads(resp)
            if 'memes' in data and 'dimensions' in data['memes'][0] and 'topic' in data['memes'][0]['dimensions']:
                result = data['memes'][0]['dimensions']['topic']
                #logging.debug(u'tagthe result %s' %result)
            else:
                result = ['', ]

        return [unescape(w) for w in result]

    # TODO: maybe find a way to call 'em all and have a super-set of kws?
    else:
        return ''
            os.path.join(
                os.path.split(inspect.getfile(inspect.currentframe()))[0],
                ".."))))
from calais import Calais
import codecs
import random
import psycopg2
from util import path_tools

# Command-line usage template; %s is filled in by the caller.
USAGE = "python %s <bill-version-file> <bill|report>"
# NOTE(review): Calais API keys are hard-coded in source.
API_KEYS = [
    "wdbkpbpsksskkbm2rpqfm4xa", "mt5qu3e4jdrd6jpc9r9ecama",
    "k9fb7rfh7hpbfp238whuggrr", "55rared7un2pnjr23kjtctes"
]
# Presumably an upper bound on the text size sent per request -- TODO confirm
# against the code that uses it.
MAX_TEXT_LENGTH = 100000
# One Calais client per API key, created at import time.
calaises = [Calais(key, submitter="python-calais-demo") for key in API_KEYS]

from util import configuration
# Database connection string, resolved once at import time.
CONN_STRING = configuration.get_connection_string()


class Entity:
    """A text entity; attributes are assigned by the code that builds it."""

    def __str__(self):
        """Format as ``text | name | type | start:end``.

        ``end`` is ``offset + length``.
        """
        fields = (
            self.text,
            self.name,
            self.type,
            self.offset,
            self.offset + self.length,
        )
        return "%s | %s | %s | %d:%d" % fields


def read_file(path):
    """Read the UTF-8 encoded text file at ``path``.

    NOTE(review): as visible here the function stops after reading, so it
    implicitly returns None.  The excerpt may be truncated -- confirm whether
    a ``return content`` follows in the full file.
    """
    with codecs.open(path, 'r', 'utf8') as f:
        content = f.read()
Example #25
0
from calais import Calais

# NOTE(review): the API key is hard-coded in source.
API_KEY = "s5mba8qn5qb4vjmc663qxn8m"
calais = Calais(API_KEY, submitter="jannae")

# Analyze a fixed sample sentence.
result = calais.analyze("George Bush was the President of the United States of America until 2009.  Barack Obama is the new President of the United States now.")

# Write the analysis summary to stdout.
result.print_summary()
Example #26
0
for filename in file:
    fout = open(("results/" + filename + ".html"), "w")
    fout.write('<html>')
    fout.write('\r\n')
    fout.write('<head><title>' + filename + '</title></head>')
    fout.write('\r\n')
    fout.write('<body>')

    with open(("articles/" + filename + ".txt"), "r") as myfile:
        sys.stdin = myfile
        content = ""
        for line in sys.stdin:
            content += line

        API_KEY = "f7vhuv2kt4fxufuvv6eznwpe"
        calais = Calais(API_KEY, submitter="python-calais newsparser")
        result = calais.analyze(content)

        print "Summary of the Calais Analysis"
        result.print_summary()

        print "Entity of the Calais Analysis"
        result.print_entities()

        i = 0
        temp = []
        entityList = []
        html = []
        for entity in result.entities:
            if result.entities[i]["_type"] in [
                    "City", "Company", "Country", "Movie", "Organization",
Example #27
0
#!/usr/bin/env python

import sys
"""
Initialize requirements for OpenCalais
"""

from calais import Calais

# NOTE(review): the API key is hard-coded in source.
CALAIS_API_KEY = 'ed42bg3ku3g3k98kv9kee78s'
# Module-level client shared by the functions below; created at import time.
calais = Calais(CALAIS_API_KEY, submitter="pagea1 tester")


def body2entities(body):
    """
    Run the article text ``body`` through the Open Calais named entity
    recognizer and group the entity names by type.

    Returns a 4-tuple of lists: (person names, company names, organization
    names, industry terms). Entities of any other type are ignored.
    """
    buckets = {
        "Person": [],
        "Company": [],
        "Organization": [],
        "IndustryTerm": [],
    }
    analysis = calais.analyze(body)
    for item in analysis.entities:
        bucket = buckets.get(item["_type"])
        if bucket is not None:
            bucket.append(item["name"])
    return (buckets["Person"], buckets["Company"],
            buckets["Organization"], buckets["IndustryTerm"])
import re
import sys
from calais import Calais #https://code.google.com/p/python-calais/

# NOTE(review): the API key is hard-coded in source.
CALAIS_API_KEY = "an5duh4ktc5twbfysaakjhxs"
calais = Calais(CALAIS_API_KEY, submitter="historySnooper test")

# Read the whole history export up front; the with-block closes the file even
# if reading fails.
with open('history-sample.csv') as f:
    lines = f.readlines()

class Topic:
    """A named topic paired with a value."""

    def __init__(self, name, value):
        self.name, self.value = name, value

class Entity:
    """A named entity paired with a value."""

    def __init__(self, name, value):
        self.name, self.value = name, value

topicsList = []
entitiesList = []
#id,lastVisitTime,title,typedCount,url,visitCount

lines.pop(0)
for line in lines:
	line = line.rstrip('\n')
	data = line.split(',')
Example #29
0
import io,sys
from calais import Calais
# NOTE(review): the API key is hard-coded in source.
API_KEY = "rg72c9882ypfjm24tvrfu6ab"
calais = Calais(API_KEY, submitter="python-calais demo")

##read text file
with open('file3.txt', 'r') as content_file:
    content = content_file.read()

##perform analysis on text
result = calais.analyze(content)
result.print_entities()

# Output file for the generated HTML.
# NOTE(review): no matching close() is visible in this excerpt -- confirm it
# is closed further down.
html_file = open("HTMLFile3.html", "w")

##the resulting entities obtained are not sorted, so we sort them here:
def comp(x,y):
    """Old-style comparison function ordering entities by descending offset.

    Compares the ``offset`` of the first instance of each entity and returns
    ``y``'s offset minus ``x``'s.
    """
    offset_y = y['instances'][0]['offset']
    offset_x = x['instances'][0]['offset']
    return offset_y - offset_x
# Sort by the offset of each entity's first instance, highest offset first.
# key/reverse is the portable equivalent of the Python 2-only ``cmp=``
# argument and is stable in the same way.
sorted_results = sorted(result.entities, key=lambda entity: entity['instances'][0]['offset'], reverse=True)

b = content.decode("utf-8")

for i in range (len(sorted_results)):   #for each entity
    offset = sorted_results[i]['instances'][0]['offset']    #find offset
    length = sorted_results[i]['instances'][0]['length']    #find length
    total = offset + length                                 #find total length

    if offset != sorted_results[i-1]['instances'][0]['offset']: #to prevent same words being linked twice
        if 'resolutions' in sorted_results[i]:  #if rdf document exists
            link = sorted_results[i]['resolutions'][0]['id']
            data = "<a href = \"" + link + "\" target=\"_blank\">" + sorted_results[i]['name'] + "</a>"
Example #30
0
		links_hash[real_url]['first_tweeted'] = min(links_hash[real_url]['first_tweeted'],link['first_tweeted'])
		links_hash[real_url]['total_tweets'] = links_hash[real_url]['total_tweets']+link['total_tweets']
		links_hash[real_url]['weighted_tweets'] = links_hash[real_url]['weighted_tweets']+link['weighted_tweets']
		calculateHotness(links_hash[real_url])
		links_hash[real_url]['tweeters'].extend(link['tweeters'])
		links_hash[real_url]['tweeters'].sort(key=lambda x: x['followers_count'],reverse=True)
	else:
		links_hash[real_url] = link

links = links_hash.values()

links.sort(key=lambda x: x['hotness'],reverse=True)


# Set up the Calais client and the saved classifier unless --min or
# --no_classify was given.
if not opts.min and not opts.no_classify:
	calais = Calais("***REMOVED***", submitter="python-calais classify")
	# SECURITY NOTE(review): pickle.load executes arbitrary code from the file;
	# only load savedclassifier.pickle from a trusted location.
	with open('savedclassifier.pickle','rb') as pkfile:
		classifier = pickle.load(pkfile)

for link in links:
	embedly = json.loads(link['embedly_blob'])

	if (not opts.min and not opts.no_classify) and link['sports_score'] is None:
		analysetext = ' '.join([embedly.get(x,'') for x in ['title', 'description', 'url'] if embedly.get(x,'') is not None])
		analysetext.encode("utf8")
		analysetext = analysetext.encode("utf8")
		analysetext= analysetext.replace('"', '\'')

		#core features extracted from classifier runs
		sportslist=['sports','nesn','weei','espn',#super types
			'Baseball','Hockey','Basketball','Football',#sports types
from calais import Calais
# NOTE(review): the API key is hard-coded in source.
API_KEY = "djgq52vv8uufzykmnb9g7myv"
calais = Calais(API_KEY, submitter ="python-calais demo")
# NOTE(review): result and result2 are not used in the visible code; each
# call still issues a request to the service.
result = calais.analyze('''Microsoft is a big company. George Bush was the President of the United States
of America until 2009.  Barack Obama is the new President of the United States now.''')
result2 = calais.analyze('''Microsoft is a big company''')
result3 = calais.analyze('''Troubled drug development company SFBC International said on Monday it has changed its name to PharmaNet Development Group Inc. and will be traded on Nasdaq under the stock symbol "PDGI".''')
# d maps "Technology" to the list a of technology entity names.
d={}
a=[]
for i in result3.entities:
    if i["_type"]=="Technology":
        a.append(i["name"])
        d["Technology"]=a;
        # Printed once per Technology entity found, showing the growing list.
        print d

Example #32
0
from calais import Calais

# NOTE(review): the API key is hard-coded in source.
API_KEY = "k6s6cewwwc5zkemjqpw7yhru"
calais = Calais(API_KEY, submitter="python-calais demo")
result = calais.analyze("michelle obama")

# The print_* helpers write to stdout themselves and return None, so wrapping
# them in ``print`` only added a stray "None" line after each report.
result.print_summary()
result.print_topics()
result.print_relations()

Example #33
0
def get_entities(content):
    """Analyze ``content`` with Calais and print the entities found.

    The API key is read from the ``API_KEY`` environment variable; a missing
    variable raises ``KeyError``.  Nothing is returned.
    """
    client = Calais(os.environ['API_KEY'], submitter="python-calais demo")
    analysis = client.analyze(content)
    analysis.print_entities()
Example #34
0
from bs4 import BeautifulSoup
from calais import Calais
import re
import dateutil.parser as dparser
from datetime import datetime
import unicodedata

# API key for Calais.
# NOTE(review): the key is hard-coded in source; consider reading it from the
# environment or a configuration file instead.
API_KEY = "g8gnzpdz52gkwyduv75zecem"
# Calais client created at module level with this script's submitter label.
calais = Calais(API_KEY, submitter="Parsing TRACE Files")


def replace_accented(input_str):
    """Return *input_str* with its combining marks (accents) removed.

    The text is decomposed with NFKD normalisation and every combining
    character is dropped before the rest is joined back together.
    (Taken from baseline.py.)
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    # unicodedata.combining() is 0 for characters that are not combining marks
    base_chars = (ch for ch in decomposed if unicodedata.combining(ch) == 0)
    return u"".join(base_chars)


for num in range(1, 172):
    print num

    with open("files/id" + str(num) + ".txt", 'r') as f:
        text = f.read()
    f.closed

    soup = BeautifulSoup(text)

    if len(soup.find_all('div', class_='msgBody')) == 0:
        # print "Not a case" #only files containing a div class with the name "msgBody" are non case files.
        # else: #now just the cases
        #PERP COMPANY
def main():
    """Tally student countries of origin and write the counts to origin.csv.

    (email, address) pairs are read from columns 3 and 9 of the student
    profile CSV, and a code -> country table from Country_code.csv.  Every
    non-empty address is sent to Calais and the returned entities decide the
    country: a ProvinceOrState entity outranks a Country entity, which
    outranks an EmailAddress entity.  Rows with an empty address, or whose
    address only produced an EmailAddress entity, are classified by the last
    dot-separated piece of the email address instead.  Progress is printed
    after every row.
    """
    # (email, address) pairs from the profile export
    email_addr = []
    with open('OECx_PH241x_3T2014_student_profile_info_2014-10-20-1645.csv', 'rU') as f:
        for row in csv.reader(f):
            email_addr.append([row[3], row[9]])

    # first cell of each row is "<code> <country name>", split on the first space
    country_code = {}
    with open('Country_code.csv', 'rU') as f:
        for row in csv.reader(f):
            pieces = row[0].split(' ', 1)
            country_code[pieces[0].lower()] = pieces[1].lower()

    # Calais client used for the address look-ups
    # NOTE(review): the API key is hard-coded here; consider moving it to
    # configuration.
    api_key = 'wukyjrm778py5wry9qdtgk9u'
    calais = Calais(api_key)

    # all tallies live in one dictionary keyed by (lower-case) label
    country_count = {'(TOTAL)': 0, 'united states': 0, '~origin unknown': 0}
    count = 0

    for pair in email_addr:
        use_email = False
        try:
            if pair[1] == '':
                # no address at all: fall back to the email address
                use_email = True
            else:
                response = calais.analyze(pair[1])
                # NOTE(review): when the response has no `entities` attribute
                # the row is not added to any tally, although it still counts
                # towards '(TOTAL)' -- confirm this is intended.
                if hasattr(response, 'entities'):
                    # rank: 3 ProvinceOrState, 2 Country, 1 EmailAddress, -1 nothing usable
                    rank, country, province = -1, '', ''
                    for each in response.entities:
                        kind = each['_type']
                        if kind == 'ProvinceOrState':
                            try:
                                country = each['resolutions'][0]['containedbycountry'].lower()
                                rank = 3
                                province = each['name'].lower()
                            except KeyError:
                                print('Country name cannot be retrieved')
                        elif kind == 'Country':
                            if rank < 2:
                                rank = 2
                                country = each['name'].lower()
                        elif kind == 'EmailAddress':
                            if rank < 1:
                                rank = 1

                    if rank == 3:
                        already_seen = country in country_count
                        country_count[country] = country_count.get(country, 0) + 1
                        # per-state breakdown is kept for the United States only
                        if already_seen and country == 'united states':
                            state_key = '(US) - ' + province
                            country_count[state_key] = country_count.get(state_key, 0) + 1
                    elif rank == 2:
                        country_count[country] = country_count.get(country, 0) + 1
                    elif rank == 1:
                        # the "address" was really an email address
                        use_email = True
                    else:
                        country_count['~origin unknown'] += 1

            if use_email:
                # map the last dot-separated piece of the email through the
                # code table; unmatched endings are counted as unknown
                email_endstr = pair[0].split('.')[-1].lower()
                name = country_code.get(email_endstr, '~origin unknown')
                country_count[name] = country_count.get(name, 0) + 1

        except ValueError:
            print('Calais could not handle the language')
            country_count['~origin unknown'] += 1
        count += 1
        print('Number of entries queried: ' + str(count))

    country_count['(TOTAL)'] = count
    country = sorted(country_count)

    print(country)
    with open('origin.csv', 'w') as fp:
        a = csv.writer(fp, delimiter=',')
        for position, key in enumerate(country):
            if key != 'united states':
                a.writerow([key, country_count[key]])
            # the United States row is emitted right after the first sorted
            # key instead of at its alphabetical position
            if position == 0:
                a.writerow(['united states', country_count['united states']])
def main():
    """Count students per country and write the result to countrybreakdown.csv.

    (email, address) pairs are read from columns 3 and 9 of the student
    profile CSV, and a code -> country table from Country_code.csv.  A
    non-empty address is sent to Calais and the first returned entity supplies
    the country.  When there is no usable address, or the value taken from
    the entity contains an '@', the country is looked up from the last
    dot-separated piece of the email address, defaulting to 'united states'.
    Progress is printed after every row.
    """
    # (email, address) pairs from the profile export
    email_addr = []
    with open('OECx_PH241x_3T2014_student_profile_info_2014-10-20-1645.csv', 'rU') as f:
        for row in csv.reader(f):
            email_addr.append([row[3], row[9]])

    # first cell of each row is "<code> <country name>", split on the first space
    country_code = {}
    with open('Country_code.csv', 'rU') as f:
        for row in csv.reader(f):
            pieces = row[0].split(' ', 1)
            country_code[pieces[0].lower()] = pieces[1].lower()

    # Calais client used for the address look-ups
    # NOTE(review): the API key is hard-coded here; consider moving it to
    # configuration.
    api_key = 'wukyjrm778py5wry9qdtgk9u'
    calais = Calais(api_key)

    # tallies keyed by lower-case country name
    country_count = {'united states': 0}
    count = 0

    for pair in email_addr:
        try:
            response = {}
            if pair[1] != '':
                response = calais.analyze(pair[1])

            from_email = True
            if hasattr(response, 'entities'):
                print(response.entities)
                # NOTE(review): only ValueError is caught below, so a first
                # entity without 'containedbycountry' would propagate KeyError.
                name = response.entities[0]['containedbycountry'].lower()
                # an '@' means an email address was entered as the address
                from_email = '@' in name
            if from_email:
                last_str = pair[0].split('.')[-1].lower()
                name = country_code.get(last_str, 'united states')
            country_count[name] = country_count.get(name, 0) + 1
        except ValueError:
            print('Calais could not handle the language')
        count += 1
        print('Number of entries queried: ' + str(count))

    print(country_count)

    with open('countrybreakdown.csv', 'w') as fp:
        a = csv.writer(fp, delimiter=',')
        for key in country_count:
            a.writerow([key, country_count[key]])
            return
        
    process_type = settings.DEFAULT_PROCESS_TYPE
    if 'contentType' in settings.PROCESSING_DIR:
        d_proc_type = proc_dir['contentType']

    if 'fields' not in params:
        if settings.ST_DEBUG:
            raise Exception('No "fields" found.')
        else:
            return

    # Create the instance of Calais and setup the parameters,
    # see open-calais.com for more information about user directives,
    # and processing directives
    c = Calais(settings.API_KEY)
    c.user_directives.update(settings.USER_DIR)
    c.processing_directives.update(settings.PROCESSING_DIR)
    c.processing_directives['contentType'] = process_type

    processed_tags = []
    for item in params['fields']:
        try:
            d = item.copy()
            
            field = d.pop('name')
            proc_type = d.pop('process_type', process_type)
            markup = d.pop('markup', False)

            data = getattr(obj, field)
Example #38
0
score float NOT NULL,
PRIMARY KEY (ID)
);""")
cursor.execute("""CREATE TABLE tweet_entity
(
ID int NOT NULL AUTO_INCREMENT,
tweet_id bigint(20) NOT NULL,
entity varchar(255) NOT NULL,
relevance float NOT NULL,
PRIMARY KEY (ID)
);""")
q_topic_insert = "INSERT INTO tweet_topic (tweet_id,topic,score) VALUES (%s,%s,%s);"
q_entity_insert = "INSERT INTO tweet_entity (tweet_id,entity,relevance) VALUES (%s,%s,%s);"

# Configure Calais connection
calais = Calais(config.calais_api_key, submitter=config.calais_user)

userprofiles = {}
last_time = time.time()

for username in usernames:
    if limit == None:
        cursor.execute("SELECT id, content FROM tweets_sample WHERE username='******'")
    else:
        cursor.execute("SELECT id, content FROM tweets_sample WHERE username='******' LIMIT " + str(limit))
    data = cursor.fetchall()
    
    analyzed = 0
    skipped = 0
    topics = {}
    no_topics = 0
Example #39
0
def classify(real_url_hash):
    """Classify the stored page for *real_url_hash* and persist the result.

    The ``embedly_blob`` of the matching ``url_info`` row is decoded and its
    title, description and url are joined into one text.  That text is scored
    twice: by the pickled classifier in ``savedclassifier.pickle`` and by
    OpenCalais.  Both results are written to ``url_info.topic_blob`` and a
    ``sports_score`` is derived from them.  If the Calais call fails, the
    text is recorded in ``error_classify`` and the Calais topic falls back to
    ``"None"`` with score 0.0.

    Database settings are read from ``config-local.json`` or, when that file
    cannot be opened, from ``config.json``.

    Args:
        real_url_hash: value of ``url_info.real_url_hash`` to classify.

    Returns:
        None.  Progress is printed to stdout.
    """
    API_KEY = "***REMOVED***"

    # Prefer the machine-local configuration, fall back to the shared one.
    try:
        with open('config-local.json') as fh:
            config = json.load(fh)
    except IOError:
        with open('config.json') as fh:
            config = json.load(fh)

    # Classifier produced by the training run.
    # NOTE(review): unpickling executes code embedded in the file; make sure
    # savedclassifier.pickle is always a trusted, locally produced file.
    with open('savedclassifier.pickle', 'rb') as pkfile:
        classifier = pickle.load(pkfile)

    calais = Calais(API_KEY, submitter="python-calais classify")

    db = MySQLdb.connect(
        host=config['mysql']['host'],
        port=config['mysql']['port'],
        user=config['mysql']['user'],
        passwd=config['mysql']['password'],
        db=config['mysql']['database'],
        use_unicode=True,
        charset="utf8")
    try:
        db.autocommit(True)
        cursor = db.cursor()

        # All statements pass their values as parameters so that quotes in
        # the hash or in the analysed text cannot break or alter the SQL.
        cursor.execute(
            "SELECT embedly_blob,real_url_hash FROM url_info "
            "WHERE real_url_hash=%s LIMIT 1;",
            (real_url_hash,))

        for row in cursor.fetchall():
            row_hash = row[1]
            embedly = json.loads(row[0])
            # .get(): a blob without one of these keys is treated like an
            # empty field instead of aborting with KeyError.
            title = embedly.get('title')
            description = embedly.get('description')
            url = embedly.get('url')

            if title and description and url:
                analysetext = title + ' ' + description + ' ' + url
            else:
                # Same layout as before for partial data: a leading blank,
                # then each available field preceded by a blank.
                analysetext = ' '
                for piece in (title, description, url):
                    if piece:
                        analysetext = analysetext + ' ' + piece

            analysetext = analysetext.encode("utf8")
            analysetext = analysetext.replace('"', '\'')

            # classifier 1: naive bayes
            _topic = classifier.classify(analysetext)
            _score = classifier.get_score(analysetext)

            # classifier 2: OpenCalais cross-check (best effort)
            try:
                result = calais.analyze(analysetext)
                topic = result.get_topics()
                score = result.get_topic_score()
            except Exception:
                topic = "None"
                score = 0.0
                cursor.execute(
                    "INSERT INTO error_classify (real_url_hash, text) "
                    "VALUES (%s, %s);",
                    (row_hash, analysetext))

            print(row_hash)
            print(analysetext)
            print(topic)
            print(_topic)

            # JSON layout kept exactly as before (scores stored as strings).
            jsonOutput = '{"topic":"%s" , "score":"%f" , "_topic":"%s" , "_score":"%f"}' % (
                topic, score, _topic, _score)
            cursor.execute(
                "UPDATE url_info SET topic_blob=%s WHERE real_url_hash=%s;",
                (jsonOutput, row_hash))

            # Calais wins when it says "Sports"; the local classifier is only
            # trusted when Calais produced no topic at all.
            if topic == "Sports":
                sports_score = str(score)
            elif topic == "None" and _topic == "sports":
                sports_score = str(_score)
            else:
                sports_score = '0'
            cursor.execute(
                "UPDATE url_info SET sports_score=%s WHERE real_url_hash=%s;",
                (sports_score, row_hash))
    finally:
        # Close the connection even when classification fails part-way.
        db.close()
Example #40
0
            if leftToWait>0:
                time.sleep(leftToWait)
            ret = func(*args,**kargs)
            lastTimeCalled[0] = time.clock()
            return ret
        return rateLimitedFunction
    return decorate


# Query manager instance created at module level.
Manager = QueryManager()
# Calais rate-limits analysis requests. The queue-based throttle sketched
# below (meant to avoid sleeping) is disabled and left commented out:
# AnalysisQueue = queue.Queue(.4);
# AnalysisQueue.execute()
# NOTE(review): the API key is committed in source; consider loading it from
# configuration or the environment instead.
key = "c3wjfrkfmrsft3r5wgxm5skr"
# Calais client created at module level with the submitter label below.
CalaisObj = Calais(key, submitter="Sam Purcell")

def pr(*args):
  """Print a %-formatted message and flush stdout immediately.

  ``args[0]`` is the format string and any further arguments are its values,
  e.g. ``pr("%s took %d ms", name, ms)``.  With a single argument the string
  is still passed through ``%`` (with an empty list), so a literal percent
  sign must be written as ``%%``.
  """
  # Conditional expression instead of the fragile `cond and a or b` idiom.
  fmt_args = args[1:] if len(args) > 1 else []
  # Parenthesised single-argument print behaves the same on Python 2 and 3.
  print(args[0] % fmt_args)
  sys.stdout.flush()

def tryConnection(applyfun):
    """Call *applyfun* and return its result, retrying once after a rollback.

    If the first call raises ``exc.SQLAlchemyError`` the session is rolled
    back via ``db.session.rollback()`` and *applyfun* is called one more
    time; an error raised by that second call propagates to the caller.
    """
    try:
        outcome = applyfun()
    except exc.SQLAlchemyError:
        db.session.rollback()
        outcome = applyfun()
    return outcome

class News():
    normalizers = {
        "feedzilla" : {
Example #41
0
File: poem.py Project: jannae/rwet
from calais import Calais

# NOTE(review): the API key is hard-coded in source; consider reading it from
# the environment or a configuration file.
API_KEY = "s5mba8qn5qb4vjmc663qxn8m"
calais = Calais(API_KEY, submitter="jannae")

# Analyse the contents of sotu2012.txt (path is relative to the working directory).
result = calais.analyze_file("sotu2012.txt")

# Print the summary of the analysis result.
result.print_summary()

def print_entities(self):
    """Print one ``type: name (relevance)`` line per entity found on *self*.

    Returns None without printing anything when *self* has no ``entities``
    attribute.  Each entity is indexed by ``'_type'``, ``'name'`` and
    ``'relevance'``; the relevance is formatted with two decimals, so it
    must be numeric.
    """
    if not hasattr(self, "entities"):
        return None
    for item in self.entities:
        # Parenthesised single-argument print: identical output on Python 2,
        # and no longer a syntax error on Python 3.
        print("%s: %s (%.2f)" % (item['_type'], item['name'], item['relevance']))
from calais import Calais
import os

# Demo: analyse the contents of Text/test.txt and print summary and entities.
API_KEY = "v5q6rvm7h4uww6sumjxuw9t7"
calais = Calais(API_KEY, submitter="python-calais demo")

# The file is only read, so open it read-only ('r+' needlessly required write
# permission) and let the context manager close it even if the read fails.
with open("Text/test.txt", 'r') as f:
    NYT = f.read()

result = calais.analyze(NYT)

result.print_summary()
result.print_entities()
Example #43
0
def get_entities(content):
    """Print the entities Calais finds in *content*.

    Reads the API key from the ``API_KEY`` environment variable (``KeyError``
    if it is not set), analyses *content* and delegates the output to the
    result's ``print_entities()``.  Returns None.
    """
    api_key = os.environ['API_KEY']
    analysis = Calais(api_key, submitter="python-calais demo").analyze(content)
    analysis.print_entities()
Example #44
0
from calais import Calais
# NOTE(review): the API key is hard-coded in source; consider reading it from
# the environment or a configuration file.
API_KEY = "dkm645ejqmq7aajt8cp6zxk7"
calais = Calais(API_KEY, submitter="python-calais demo")
# Analyse one fixed sample sentence.
result = calais.analyze(
    "My 15 year old Daughter has sores in her genital area and her mouth.She swares she did nothing but kiss her boyfriend.She also has flu-like symptoms."
)
# Print the summary, entities and topics of the result.
result.print_summary()
result.print_entities()
result.print_topics()
# Wait for a line on stdin before the script ends (presumably to keep the
# console window open); the value read is not used within this snippet.
p = raw_input()