def get_calais_subjects(text, uid):
    """Extract subject tags for *text* via the OpenCalais web service.

    Reads the API key and relevance threshold from the Plone registry and
    returns the names of all ``socialTag`` entries whose importance exceeds
    the configured relevance.  Returns an empty list when no API key is
    configured or when the OpenCalais call fails.
    """
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ITagHelperSettingsSchema)
    api_key = settings.calais_api_key
    relevance = settings.calais_relevance
    subjects = []
    if api_key:
        calais = Calais(api_key)
        try:
            result = calais.analyze(text, external_id=uid)
        except Exception:
            # Narrowed from a bare ``except:`` so SystemExit and
            # KeyboardInterrupt are no longer swallowed; any API/network
            # failure still degrades to "no subjects" as before.
            return []
        # if hasattr( result, 'entities'):
        #     for entity in result.entities:
        #         if entity['_type'] in PREFERRED_ENTITIES:
        #             subjects.append(entity['name'])
        if hasattr(result, 'socialTag'):
            for tag in result.socialTag:
                if float(tag['importance']) > relevance:
                    subjects.append(tag['name'])
        # if hasattr( result, 'relations'):
        #     for fact in result.relations:
        #         if fact['_type'] in PREFERRED_FACTS:
        #             ft = fact.get(fact['_type'].lower())
        #             if ft:
        #                 subjects.append(ft)
    return subjects
def populate_database_with_semantic_data_from_comments(calais_api_key, db_cursor, debug):
    # Run every not-yet-processed comment through OpenCalais and store the
    # extracted entities and topics, then flag the comment as updated.
    # NOTE(review): reconstructed formatting — the UPDATE appears to run even
    # when analysis failed, so failed comments are not retried; confirm intent.
    calais = Calais(calais_api_key, submitter='usermine')
    db_cursor.execute('SELECT id, comment FROM comments WHERE updated = 0')
    for comment_data in db_cursor.fetchall():
        id = comment_data[0]
        comment = comment_data[1]
        try:
            result = calais.analyze(comment)
            if hasattr(result, 'entities'):
                for entity in result.entities:
                    entity_name = entity['name']
                    db_cursor.execute('INSERT INTO entities (id, entity) VALUES (NULL, ?)', [entity_name])
            if hasattr(result, 'topics'):
                for topic in result.topics:
                    topic_name = topic['categoryName']
                    db_cursor.execute('INSERT INTO topics (id, topic) VALUES (NULL, ?)', [topic_name])
        except:
            if debug:
                print sys.exc_info()
        # Mark the comment processed regardless of the analysis outcome.
        db_cursor.execute('UPDATE comments SET updated=1 WHERE id = ?', [id])
        if debug:
            print '.'
def process_calais(content, key):
    """Run *content* through OpenCalais and collect the person entities.

    Returns a dict with a single ``"people"`` key listing the names of
    every entity whose Calais type is ``Person``; the list is empty when
    the response carries no entities at all.
    """
    calais = Calais(key)
    response = calais.analyze(content)
    people = []
    for entity in getattr(response, "entities", []):
        if entity["_type"] == "Person":
            people.append(entity["name"])
    return {"people": people}
def get_calais_subjects(text, uid):
    """Return OpenCalais socialTag names for *text* above the configured relevance.

    API key and relevance threshold come from the Plone registry; an empty
    list is returned when no key is configured or the analysis fails.
    """
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ITagHelperSettingsSchema)
    api_key = settings.calais_api_key
    relevance = settings.calais_relevance
    subjects = []
    if api_key:
        calais = Calais(api_key)
        try:
            result = calais.analyze(text, external_id=uid)
        except Exception:
            # Was a bare ``except:`` — that also caught SystemExit and
            # KeyboardInterrupt.  Failures still yield an empty list.
            return []
        # if hasattr( result, 'entities'):
        #     for entity in result.entities:
        #         if entity['_type'] in PREFERRED_ENTITIES:
        #             subjects.append(entity['name'])
        if hasattr(result, "socialTag"):
            for tag in result.socialTag:
                if float(tag["importance"]) > relevance:
                    subjects.append(tag["name"])
        # if hasattr( result, 'relations'):
        #     for fact in result.relations:
        #         if fact['_type'] in PREFERRED_FACTS:
        #             ft = fact.get(fact['_type'].lower())
        #             if ft:
        #                 subjects.append(ft)
    return subjects
def getOpenCalaisResultFromURL(url):
    """Analyze *url* with OpenCalais using a randomly chosen API key.

    A key is picked at random from ``openCalaisKeys`` (spreading request
    quota across keys).  Returns the Calais response object, or ``None``
    when the analysis fails for any reason.
    """
    # Index over the whole key list instead of the hard-coded 0..6 range,
    # so adding or removing keys no longer risks an IndexError.
    API_KEY = openCalaisKeys[randint(0, len(openCalaisKeys) - 1)]
    calais = Calais(API_KEY, submitter="python-calais demo")
    try:
        result = calais.analyze_url(url)
    except Exception:
        # Narrowed from a bare ``except:``; failures still yield None.
        result = None
    return result
def analyze(request):
    """Django view: run the POSTed ``content`` through OpenCalais.

    Responds with the JSON entities of the analysis; an empty JSON string
    is returned when no content was posted or the analysis yields nothing.
    """
    API_KEY = 'kgmykdr862hdfhkzkuchnxkc'
    calais = Calais(API_KEY, 'python')
    try:
        result = calais.analyze(request.POST['content'])
        if result:
            payload = json.dumps(result.get_json_entities())
        else:
            payload = json.dumps("")
    except KeyError:
        # No 'content' field in the POST data.
        payload = json.dumps("")
    return HttpResponse(payload, mimetype="application/json")
def calaisify_main(options, args):
    # Walk every persisted entry that has not yet been Calais-annotated,
    # resolve any shortened URLs found in its title, and print the
    # OpenCalais analysis (summary, topics, entities) for each target URL.
    m = persistence.MongoPersistence()
    calais = Calais(config.calais["api_key"], submitter="python-calais demo")
    for d in m.get_non_calaised_entries():
        l = short_url.has(d["title"])  # short URLs detected in the title
        if len(l):
            for url in l:
                long_url = short_url.find_real(url)  # follow the redirect
                response = calais.analyze_url(long_url)
                response.print_summary()
                response.print_topics()
                response.print_entities()
        else:
            pass  # no short links in this entry; nothing to analyze
def contract(request, contract):
    """Django view: render a contract's blocks annotated with Calais entities."""
    collection = connection()
    # Blocks mentioning this contract, oldest first.
    blocks = collection.find({'contracts': contract}, sort=[('date', ASCENDING)])
    # get last block and display if it was approved or not
    calais = Calais(settings.CALAIS, submitter="hello world")
    annotated = []
    for block in blocks:
        # Attach the OpenCalais entities extracted from the block's text.
        block['entities'] = calais.analyze(block['string']).entities
        annotated.append(block)
    response = {'items': annotated, 'contract': contract}
    return render_to_response('contract.html', response)
def analyze(model, text=None, backend='calais'):
    # Analyze the model's text (or *text* if given) with OpenCalais and
    # mirror recognized entities into local Person/Place/Organisation
    # records linked to the model.
    # Returns: (result, records) on success; None when the text is under
    # 100 characters; False when the Calais result had no entities.
    from calais import Calais
    from django.conf import settings
    calais = Calais(settings.CALAIS_API_KEY, submitter='newscredit')
    if not text:
        _text = model.analysis_text()
    else:
        _text = text
    # we cannot analyse if our total content is under 100 characters
    # after HTML cleaning. We leave OpenCalais to do this as they have
    # advanced heuristics to do it. If our text to analyse is less than
    # 100 characters, we skip the analysis.
    if len( _text ) < 100:
        return
    result = calais.analyze(_text)
    records = []
    try:
        for entity in result.entities:
            # Map the Calais entity type onto one of our local models.
            if entity['_type'] == 'Person':
                _model = Person
            elif entity['_type'] in [ 'City', 'Country', 'Continent', 'Facility', 'Region' ]:
                _model = Place
            elif entity['_type'] in [ 'Organization', 'Company' ]:
                _model = Organisation
            else:
                continue
            try:
                # Re-use an existing record keyed by the Calais URI.
                _record = _model.objects.get(uri=entity['__reference'])
            except _model.DoesNotExist, e:
                _record = _model()
            _record.from_calais(entity)
            _record.save()
            model.add_entity(_record)
            records.append(_record)
    except AttributeError:
        # this happens if Calais throws an error. To ensure we continue
        # processing other records pass this error and return False
        return False
    return result, records
def analyze(model, text=None, backend='calais'): from calais import Calais from django.conf import settings calais = Calais(settings.CALAIS_API_KEY, submitter='newscredit') if text: result = calais.analyze(text) else: result = calais.analyze(model.analysis_text()) people = [] for entity in result.entities: if entity['_type'] == 'Person': try: person = Person.objects.get(uri=entity['__reference']) except Person.DoesNotExist, e: person = Person() person.from_calais(entity) person.save() people.append(person)
def calais_test(request):
    # Django view: analyze the URL given in the ``content_url`` request
    # parameter (falling back to http://n-tv.de when empty) with OpenCalais
    # and render the entities found plus the detected document language.
    content_url = request.REQUEST['content_url']
    if content_url == '':
        content_url = 'http://n-tv.de'
    calais = Calais('pc5v39x8sq3mh4mv9zm2ppre' , submitter="ask-a-sap-question")
    result = calais.analyze_url(content_url)
    result.print_summary()
    c_data = {
        "name" : "Barack",
    }
    # Map each entity name to its Calais type for the template.
    for entity in result.entities:
        c_data[entity['name']] = entity['_type']
    lang = result.doc['meta']['language']
    return render_to_response('calais_test.html', { 'text': content_url, 'calais_data': c_data, 'language': lang})
def __init__(self):
    """Set up the database name plus the db and Calais connections."""
    self.db_name = "pythia_db"
    connections = {}
    connections["db"] = connect(self.db_name)
    connections["calais"] = Calais("av536xwvy4mgmcbw9cancqmd", submitter="pythia-application")
    self.connections = connections
def main(): """ Main method """ if not sys.stdin: usage() sys.exit(1) input = sys.stdin.read() # read from stdin input = input.replace("<", "") input = input.replace(">", "") api_key = config.read_config('calais', 'api_key') calais = Calais(api_key, submitter="fridakahlo") entities = calais.analyze(input).entities #for e in entities: print e['name'], len(e['instances']), '<br>' linked_text = add_links(entities, input) for line in linked_text.splitlines(): print "<p>", line, "<p>"
def entities(env, start_response):
    """Extracts entities from resume utilizing the OpenCalais webservice."""
    # WSGI handler: always answers 200 with an XML body; file-read errors
    # surface as a restlite 400 status, Calais errors as an <error> element.
    start_response('200 OK', [('Content-Type', 'text/xml')])
    API_KEY = "kqyhhfppufvmvxspkstwjxw5"
    calais = Calais(API_KEY, submitter="resume_analysis")
    try:
        with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
            text = f.read()
    except:
        raise restlite.Status, '400 Error Reading File'
    try:
        results = calais.analyze(text)
    except Exception as e:
        return "<error>%s</error>" % e
    # One (name, type, relevance) triple per recognized entity.
    entities_tuples = [(entity['name'], entity['_type'], entity['relevance'])
                      for entity in results.entities]
    doc = create_xml({'entities': entities_tuples})
    return [str(doc.toxml())]
def _get_people(text):
    '''
    Extract people and their coreferences from *text* via OpenCalais.

    Maps the canonical name of every Person entity in the document to a
    list of (exact, suffix, prefix) tuples -- one per occurrence,
    including pronoun references -- capturing the text surrounding each
    mention for later context use.

    Returns False when Calais detects no entities at all.  Calais API
    errors are not handled here and will propagate to the caller.
    '''
    calais = Calais(API_KEY, submitter="tbc-coref-test")
    annotations = calais.analyze(text)
    # Nothing recognized at all: bail out early.
    if not hasattr(annotations, 'entities'):
        return False
    coref = {}  # canonical name -> list of mention tuples
    for entity in annotations.entities:
        # Companies, places etc. are irrelevant for coreference here.
        if entity['_type'] != 'Person':
            continue
        mentions = [
            (occ.get('exact'), occ.get('suffix', ''), occ.get('prefix', ''))
            for occ in entity['instances']
        ]
        canonical = entity.get("commonname", entity.get('name', None))
        coref[canonical] = mentions
    return coref
def __init__(self, content_object, content_fields=None):
    # Wrap *content_object* and set up the OpenCalais client.  The fields
    # to analyze come either from the *content_fields* argument or from a
    # ``calais_content_fields`` attribute on the object's class; a missing
    # attribute is reported as OpenCalaisTagFetchError.
    super(OpenCalais, self).__init__(content_object)
    self.calais = Calais(settings.CALAIS_API_KEY, settings.CALAIS_SUBMITTER)
    if content_fields:
        self.calais_content_fields = content_fields
    else:
        try:
            self.calais_content_fields = dict(
                self.content_object.__class__.calais_content_fields)
        except FieldDoesNotExist, e:
            raise OpenCalaisTagFetchError(
                'You need to define calais_content_fields: %s' % e)
def _get_people(text): ''' Runs input text through Calais to extract people, coreferences and their locations. This function returns a canonical name for any given source in the document and contextual information about where coreferences appear, based on the text before and after the pronoun occurrance. Takes full story text as input. This is a pretty bare-bones function. It doesn't handle Calais API errors, so it tends to crap out from time to time. Future refinements should account for this. ''' # Run input text through Calais calais = Calais(API_KEY, submitter="tbc-coref-test") annotations = calais.analyze(text) # If no entities come back, peace out if not hasattr(annotations, 'entities'): return False coref = {} # Dictionary to hold our corefence object. for e in annotations.entities: instances = [] # We only care about Person entities, not companies, places, etc. if e['_type'] == 'Person': # For each instance of that entity (which includes pronouns and other references) ... for i in e['instances']: # Collect the coreference text (exact) the preceding text (prefix) and the # following text (suffix) for reference information. We'll need this later. instances.append((i.get('exact'), i.get('suffix', ''), i.get('prefix', ''))) # Associate the canonical name with the coreference and context information gathered # above for use later. name = e.get("commonname", e.get('name', None)) coref[name] = instances return coref
class classifier():
    # OpenCalais-backed tagger: extracts entities from a text, filters out
    # profanity, and enriches each surviving tag with a Qwiki URL and a
    # Google image.
    # NOTE(review): _get_google_image and _process_qwiki_results are called
    # below but not visible in this chunk — presumably defined further down.
    API_KEY = 'xyby6x47ycxj56bkkb83s9he'

    def __init__(self):
        self.calais = Calais(self.API_KEY, submitter='SocialTV Demo')

    def process(self, ip, text):
        # Analyze *text*; return a list of dicts, one per accepted tag.
        result = self.calais.analyze(text)
        all_tags = []
        if hasattr(result, 'entities'):
            all_tags.extend(result.entities)
        if hasattr(result, 'socialTag'):
            #all_tags.extend(result.socialTag)
            pass
        if len(all_tags) == 0:
            return []
        output = []
        for tag in all_tags:
            # Skip profanity; keep only tags with a resolvable Qwiki page.
            if tag['name'].lower() not in dirtyWords.DIRTY:
                qwiki = self._get_qwiki_url(tag['name'])
                if len(qwiki) > 0:
                    img = self._get_google_image(ip, tag['name'])
                    print "Tag: "+tag['name']
                    output.append({'text':text ,'tag': tag['name'], 'qwiki':qwiki, 'img': img})
        return output

    def _get_qwiki_url(self, text):
        # Probe the direct Qwiki embed URL first; on URL errors fall back
        # to the Qwiki search API and delegate to _process_qwiki_results.
        try:
            tmp_txt = text.replace(' ', '_')
            response = urllib2.urlopen("http://www.qwiki.com/embed/"+urllib2.quote(tmp_txt)+"?autoplay=true")
            return "http://www.qwiki.com/embed/"+urllib2.quote(tmp_txt)+"?autoplay=true"
        except urllib2.URLError, e:
            response = urllib2.urlopen("http://embed-api.qwiki.com/api/v1/search.json?count=1&q="+urllib2.quote(text))
            html = response.read()
            html_eval = json.loads(html)
            if len(html_eval) > 0:
                return self._process_qwiki_results(text, html_eval)
            else:
                return []
class calaisApi:
    # Thin memoized wrapper around the OpenCalais client.

    def __init__(self):
        self.calais = Calais(KEY, submitter="GodelTest")

    @persistent_memoize
    def calais_run(self, sentence):
        # Analyze *sentence* and return its Calais entities.
        # NOTE(review): three distinct results — a non-empty list on
        # success, False when Calais found no entities, and None when
        # analyze() raised ValueError.  Callers must handle all three.
        entities = []
        try:
            result = self.calais.analyze(sentence)
        except ValueError:
            return
        if hasattr(result, "entities"):
            if len(result.entities) > 0:
                for results in result.entities:
                    entities.append(results)
        if len(entities) > 0:
            return entities
        else:
            return False
def extract_entities(text, retries=5):
    """
    Input: entity_text
    Output: calais entity
    """
    # Returns the list of Calais entity *type* strings found in *text*,
    # retrying up to *retries* times on API failure (empty list if all
    # attempts fail).
    import time
    # Make the bundled NER package importable relative to this file.
    sys.path.insert(
        0,
        os.path.realpath(
            os.path.abspath(
                os.path.join(
                    os.path.split(inspect.getfile(inspect.currentframe()))[0],
                    "../../ner"))))
    from calais import Calais
    # Seeding with the text makes the API-key choice deterministic per input.
    random.seed(text)
    API_KEYS = [
        "wdbkpbpsksskkbm2rpqfm4xa",
        "mt5qu3e4jdrd6jpc9r9ecama",
        "k9fb7rfh7hpbfp238whuggrr",
        "55rared7un2pnjr23kjtctes",
        "ccw5tvhv5sewvnnnpkfa9ydn",
        "ne7yxpax4ebug4qz3p4jguej",
        "nsuasahckne72keu8qu6zjd3",
        "bvuy6mqmr7z7x8jw5f4zzpkr"
    ]
    calaises = [
        Calais(key, submitter="python-calais-demo")
        for key in API_KEYS
    ]
    entities = []
    calais = calaises[random.randint(0, len(calaises) - 1)]
    for i in range(retries):
        try:
            result = calais.analyze(text)
            if hasattr(result, 'entities'):
                for calais_entity in result.entities:
                    e_type = calais_entity['_type']
                    entities.append(e_type)
            return entities
        except:
            logging.exception("failed while calling calais")
            time.sleep(1)  # back off briefly before retrying
    logging.error("failed with all tries to call calais")
    return entities
### Ariana Giorgi ### 10/31/2014 ### Computational Journalism Assignment #3 - Open Calais ### https://code.google.com/p/python-calais/ from calais import Calais import collections #set Calais API Key and create new instance API_KEY = "g8gnzpdz52gkwyduv75zecem" calais = Calais(API_KEY, submitter="python-calais demo") #demo text input_text = "George Bush was the President of the United States of America until 2009. Barack Obama is the new President of the United States now." with open('stdin_1.txt', 'r') as f: input_text = f.read() f.closed result = calais.analyze(input_text) #result.print_entities() #initialize dictionary that will contain the linked data link_list = {} #initialize detected references count (collected for assignment) detected_count = 0 #loop through each entity and assign a link for i in range(len(result.entities)): if 'resolutions' in result.entities[i]: #if Calais has assigned an RDF value, use that as the link
def web_extract_terms(text, raw_query='', service='tagthe'):
    """
    Given a text, extract keyword terms with the selected web_service.

    Args:
      text: raw text from where to extract terms
      query: a query which may contextualize the extraction (only used by yahoo)
      service: which web service to use
    Returns:
      query: a sequence of query terms
    """
    service = service.lower().strip()
    if not service in WEB_SERVICES.keys():
        raise ValueError('%s is an invalid web service, possible choices are %s' % (service, repr(WEB_SERVICES.keys())))
    #1. Build the query:
    query = {}
    apikey = settings.WEB_SERVICES_KEYS.get(service, '')
    if service == 'wordsfinder':
        query = {
            'apikey' : apikey,
            'context': text + raw_query,
        }
    elif service == 'alchemy':
        query = {
            'apikey' : apikey,
            'text' : text + raw_query,
            'outputMode' : 'json'
        }
    elif service == 'yahoo':
        query = {
            'appid': apikey,
            'context': text,
            'output': 'json',
        }
        if raw_query:
            query.update({'query': raw_query})
    elif service == 'tagthe':
        query = {
            'text': text + raw_query,
            'view': 'json',
        }
    elif service == 'opencalais':
        # OpenCalais short-circuits: it answers via its client library
        # instead of the generic HTTP round-trip below.
        #use the python interface, obtained from:
        #http://www.opencalais.com/applications/python-calais-python-interface-opencalais-api
        #logging.debug('Using the opencalais interface with key %s' % apikey)
        s = Calais(apikey)
        try:
            res = s.analyze(text + raw_query)
        except:
            raise WebServiceException(service, 'error in request')
        #logging.debug('The raw response: %s' % res.raw_response)
        if hasattr(res, 'topics') or hasattr(res, 'entities'):
            retval = [t['categoryName'] for t in res.topics] if hasattr(res, 'topics') else []
            retval += [t['name'] for t in res.entities] if hasattr(res, 'entities') else []
            return retval
        else:
            #raise WebServiceException(service, 'No topics or entities found')
            #logging.info("OpenCalais didn't return topics|entities for %s" %text)
            return ["",]
    # elif service == 'extractor':
    #     #use the python interface
    #     #logging.debug('using the extractor interface with key %s' % apikey)
    #     extractor=ExtractorService(key=apikey, url=WEB_SERVICES[service])
    #     raw_response = extractor.extract(text + raw_query)
    #     #logging.debug('The raw response: %s' % raw_response)
    #
    #     if raw_response.get('ExtractionStatus') == '-1':
    #         print "failure!"
    #         raise WebServiceException(service, "Failure in request")
    #     else:
    #         #TODO: what DOES it return?
    #         return raw_response
    #2. Try to call the service:
    resp = None
    #logging.debug('requesting %s' % WEB_SERVICES[service]+'?%s'%urlencode(query))
    try:
        #HACK: tagthe has issues with POST requests, so try and do a GET
        #max length for a GET request is 2048:
        #http://stackoverflow.com/questions/1344616/max-length-of-query-string-in-an-ajax-get-request
        if service == 'tagthe': # and len(urlencode(query)) <= 2048:
            resp_url = urlopen(WEB_SERVICES[service]+'?%s'%urlencode(query))
        else:
            #HACK: fallback to yahoo if the request is too much for tagthe
            #service = 'yahoo' if service == 'tagthe' else service
            resp_url = urlopen(WEB_SERVICES[service], urlencode(query))
        resp = resp_url.read() #this causes the exception...
        #logging.debug( u"%s returned %s" % (service, resp))
    except Exception as e:
        #TODO: retry in timeouts and stuff
        #logging.debug('Error in request: %s' % e, exc_info = True)
        raise WebServiceException(service, 'Error in request to service : %s' % e)
    #3. Process the response:
    if resp:
        result = ''
        if service == 'alchemy':
            data = json.loads(resp)
            if data['status'] == 'ERROR':
                raise WebServiceException(service, 'call returned error status')
            result = [re.sub('-[^ ] ', '', e['text']) for e in data['keywords']]
        elif service == 'yahoo':
            data = json.loads(resp)
            result = data['ResultSet']['Result']
        elif service == 'wordsfinder':
            parsed_response = parseString(resp)
            e = parsed_response.getElementsByTagName('error')
            if e:
                raise WebServiceException(service, 'error code %s' % e.firstChild.datad)
            else:
                result = [node.firstChild.data for node in parsed_response.getElementsByTagName('keyword')]
        elif service == 'tagthe':
            data = json.loads(resp)
            if 'memes' in data and 'dimensions' in data['memes'][0] and 'topic' in data['memes'][0]['dimensions']:
                result = data['memes'][0]['dimensions']['topic']
                #logging.debug(u'tagthe result %s' %result)
            else:
                result = ['', ]
        return [unescape(w) for w in result] # TODO: maybe find a way to call 'em all and have a super-set of kws?
    else:
        return ''
os.path.join( os.path.split(inspect.getfile(inspect.currentframe()))[0], "..")))) from calais import Calais import codecs import random import psycopg2 from util import path_tools USAGE = "python %s <bill-version-file> <bill|report>" API_KEYS = [ "wdbkpbpsksskkbm2rpqfm4xa", "mt5qu3e4jdrd6jpc9r9ecama", "k9fb7rfh7hpbfp238whuggrr", "55rared7un2pnjr23kjtctes" ] MAX_TEXT_LENGTH = 100000 calaises = [Calais(key, submitter="python-calais-demo") for key in API_KEYS] from util import configuration CONN_STRING = configuration.get_connection_string() class Entity: def __str__(self): return "%s | %s | %s | %d:%d" % (self.text, self.name, self.type, self.offset, self.offset + self.length) def read_file(path): with codecs.open(path, 'r', 'utf8') as f: content = f.read()
# Minimal OpenCalais smoke test: analyze a fixed sentence and print the
# summary of the response.
from calais import Calais

API_KEY = "s5mba8qn5qb4vjmc663qxn8m"
calais = Calais(API_KEY, submitter="jannae")
result = calais.analyze("George Bush was the President of the United States of America until 2009. Barack Obama is the new President of the United States now.")
result.print_summary()
for filename in file: fout = open(("results/" + filename + ".html"), "w") fout.write('<html>') fout.write('\r\n') fout.write('<head><title>' + filename + '</title></head>') fout.write('\r\n') fout.write('<body>') with open(("articles/" + filename + ".txt"), "r") as myfile: sys.stdin = myfile content = "" for line in sys.stdin: content += line API_KEY = "f7vhuv2kt4fxufuvv6eznwpe" calais = Calais(API_KEY, submitter="python-calais newsparser") result = calais.analyze(content) print "Summary of the Calais Analysis" result.print_summary() print "Entity of the Calais Analysis" result.print_entities() i = 0 temp = [] entityList = [] html = [] for entity in result.entities: if result.entities[i]["_type"] in [ "City", "Company", "Country", "Movie", "Organization",
#!/usr/bin/env python import sys """ Initialize requirements for OpenCalais """ from calais import Calais CALAIS_API_KEY = 'ed42bg3ku3g3k98kv9kee78s' calais = Calais(CALAIS_API_KEY, submitter="pagea1 tester") def body2entities(body): """ Given an article (STRING body), use the Open Calais named entity recognizer to return all entities therein. """ names, companies, orgs, terms = [], [], [], [] result = calais.analyze(body) for entity in result.entities: if (entity["_type"] == "Person"): names.append(entity["name"]) if (entity["_type"] == "Company"): companies.append(entity["name"]) if (entity["_type"] == "Organization"): orgs.append(entity["name"]) if (entity["_type"] == "IndustryTerm"): terms.append(entity["name"]) return names, companies, orgs, terms
import re import sys from calais import Calais #https://code.google.com/p/python-calais/ CALAIS_API_KEY = "an5duh4ktc5twbfysaakjhxs" calais = Calais(CALAIS_API_KEY, submitter="historySnooper test") f = open('history-sample.csv') lines = f.readlines() f.close() class Topic: def __init__(self, name, value): #makes 1 call to users self.name = name self.value = value class Entity: def __init__(self, name, value): #makes 1 call to users self.name = name self.value = value topicsList = [] entitiesList = [] #id,lastVisitTime,title,typedCount,url,visitCount lines.pop(0) for line in lines: line = line.rstrip('\n') data = line.split(',')
import io,sys from calais import Calais API_KEY = "rg72c9882ypfjm24tvrfu6ab" calais = Calais(API_KEY, submitter="python-calais demo") ##read text file with open('file3.txt', 'r') as content_file: content = content_file.read() ##perform analysis on text result = calais.analyze(content) result.print_entities() html_file = open("HTMLFile3.html", "w") ##the resulting entities obtained are not sorted, so we sort them here: def comp(x,y): return y['instances'][0]['offset'] - x['instances'][0]['offset'] sorted_results = sorted (result.entities,cmp=comp) b = content.decode("utf-8") for i in range (len(sorted_results)): #for each entity offset = sorted_results[i]['instances'][0]['offset'] #find offset length = sorted_results[i]['instances'][0]['length'] #find length total = offset + length #find total length if offset != sorted_results[i-1]['instances'][0]['offset']: #to prevent same words being linked twice if 'resolutions' in sorted_results[i]: #if rdf document exists link = sorted_results[i]['resolutions'][0]['id'] data = "<a href = \"" + link + "\" target=\"_blank\">" + sorted_results[i]['name'] + "</a>"
links_hash[real_url]['first_tweeted'] = min(links_hash[real_url]['first_tweeted'],link['first_tweeted']) links_hash[real_url]['total_tweets'] = links_hash[real_url]['total_tweets']+link['total_tweets'] links_hash[real_url]['weighted_tweets'] = links_hash[real_url]['weighted_tweets']+link['weighted_tweets'] calculateHotness(links_hash[real_url]) links_hash[real_url]['tweeters'].extend(link['tweeters']) links_hash[real_url]['tweeters'].sort(key=lambda x: x['followers_count'],reverse=True) else: links_hash[real_url] = link links = links_hash.values() links.sort(key=lambda x: x['hotness'],reverse=True) if not opts.min and not opts.no_classify: calais = Calais("***REMOVED***", submitter="python-calais classify") with open('savedclassifier.pickle','rb') as pkfile: classifier = pickle.load(pkfile) for link in links: embedly = json.loads(link['embedly_blob']) if (not opts.min and not opts.no_classify) and link['sports_score'] is None: analysetext = ' '.join([embedly.get(x,'') for x in ['title', 'description', 'url'] if embedly.get(x,'') is not None]) analysetext.encode("utf8") analysetext = analysetext.encode("utf8") analysetext= analysetext.replace('"', '\'') #core features extracted from classifier runs sportslist=['sports','nesn','weei','espn',#super types 'Baseball','Hockey','Basketball','Football',#sports types
# OpenCalais demo: analyze three text snippets and print the names of the
# Technology-typed entities found in the third one.
from calais import Calais

API_KEY = "djgq52vv8uufzykmnb9g7myv"
calais = Calais(API_KEY, submitter ="python-calais demo")
result = calais.analyze('''Microsoft is a big company. George Bush was the President of the United States of America until 2009. Barack Obama is the new President of the United States now.''')
result2 = calais.analyze('''Microsoft is a big company''')
result3 = calais.analyze('''Troubled drug development company SFBC International said on Monday it has changed its name to PharmaNet Development Group Inc. and will be traded on Nasdaq under the stock symbol "PDGI".''')
d = {}
a = []
# Collect the names of all Technology entities from the third result.
for i in result3.entities:
    if i["_type"] == "Technology":
        a.append(i["name"])
d["Technology"] = a;
print d
# OpenCalais demo: analyze a two-word query and print the summary, topics
# and relations.  The print_* helpers do their own printing and return
# None, so each outer ``print`` statement additionally emits "None".
from calais import Calais

API_KEY = "k6s6cewwwc5zkemjqpw7yhru"
calais = Calais(API_KEY, submitter="python-calais demo")
result = calais.analyze("michelle obama")
print result.print_summary()
print result.print_topics()
print result.print_relations()
def get_entities(content):
    """Analyze *content* with OpenCalais and print the entities found.

    The API key is read from the ``API_KEY`` environment variable; a
    missing variable raises KeyError, exactly as before.
    """
    API_KEY = os.environ['API_KEY']
    client = Calais(API_KEY, submitter="python-calais demo")
    analysis = client.analyze(content)
    analysis.print_entities()
from bs4 import BeautifulSoup from calais import Calais import re import dateutil.parser as dparser from datetime import datetime import unicodedata #api key for calais API_KEY = "g8gnzpdz52gkwyduv75zecem" calais = Calais(API_KEY, submitter="Parsing TRACE Files") def replace_accented(input_str): #from baseline.py nkfd_form = unicodedata.normalize('NFKD', input_str) return u"".join([c for c in nkfd_form if not unicodedata.combining(c)]) for num in range(1, 172): print num with open("files/id" + str(num) + ".txt", 'r') as f: text = f.read() f.closed soup = BeautifulSoup(text) if len(soup.find_all('div', class_='msgBody')) == 0: # print "Not a case" #only files containing a div class with the name "msgBody" are non case files. # else: #now just the cases #PERP COMPANY
def main():
    # Build a per-country head count of students from a course-profile CSV:
    # street addresses are resolved to countries via OpenCalais, falling
    # back to the email TLD, and the tallies are written to origin.csv.
    # read in csv file to extract the email and addresses field
    # put email and addr into a list of tuples
    email_addr = []
    with open('OECx_PH241x_3T2014_student_profile_info_2014-10-20-1645.csv', 'rU') as f:
        reader = csv.reader(f)
        for row in reader:
            pair = [row[3], row[9]]
            email_addr.append(pair)
    # create dictionary to find the country code | to iterate over the dictionary=> for key in d:
    country_code = {}
    with open('Country_code.csv', 'rU') as f:
        reader = csv.reader(f)
        for row in reader:
            key = row[0].split(' ', 1)[0].lower()
            value = row[0].split(' ', 1)[1].lower()
            country_code[key] = value
    # make Calais calls to extract country name
    api_key = 'wukyjrm778py5wry9qdtgk9u'
    calais = Calais(api_key)
    # dictionary to store all the results
    country_count = {}
    country_count['(TOTAL)'] = 0
    country_count['united states'] = 0
    country_count['~origin unknown'] = 0
    count = 0
    for pair in email_addr:
        check = 0
        try:
            response = {}
            if pair[1] != '':
                response = calais.analyze(pair[1])
                # if the addr contains country information
                if hasattr(response, 'entities'):
                    # entry is a list of 3 elements: priority (3 for ProvinceOrState, 2 for Country, 1 for EmailAddress ), Country, Province
                    entry = [-1, '', '']
                    for each in response.entities:
                        if each['_type'] == 'ProvinceOrState':
                            try:
                                entry[1] = each['resolutions'][0]['containedbycountry'].lower()
                                entry[0] = 3
                                entry[2] = each['name'].lower()
                            except KeyError:
                                print 'Country name cannot be retrieved'
                        elif each['_type'] == 'Country':
                            if entry[0] < 2:
                                entry[0] = 2
                                entry[1] = each['name'].lower()
                        elif each['_type'] == 'EmailAddress':
                            if entry[0] < 1:
                                entry[0] = 1
                    if entry[0] == 3:
                        # Province found: tally the containing country, and
                        # a per-state "(US) - <state>" bucket for the USA.
                        name = '(US) - ' + entry[2]
                        if entry[1] not in country_count:
                            country_count[entry[1]] = 1
                        else:
                            country_count[entry[1]] = 1 + country_count[entry[1]]
                        if entry[1] == 'united states':
                            if name not in country_count:
                                country_count[name] = 1
                            else:
                                country_count[name] = 1 + country_count[name]
                    elif entry[0] == 2:
                        if entry[1] not in country_count:
                            country_count[entry[1]] = 1
                        else:
                            country_count[entry[1]] = 1 + country_count[entry[1]]
                    elif entry[0] == 1:
                        check = 1 # go through email check
                    else:
                        country_count['~origin unknown'] = country_count['~origin unknown'] + 1
            else:
                check = 1
            # if addr is empty, query email address mapping table; if no entry, Unknown add 1
            # here we assume that all entries without addr and without strong indication of country origins in their emails will be categorized under the USA entry
            if check == 1:
                # determine entry name
                email_endstr = pair[0].split('.')[-1].lower()
                if email_endstr in country_code:
                    name = country_code[email_endstr]
                else:
                    name = '~origin unknown'
                # add entry
                if name not in country_count:
                    country_count[name] = 1
                else:
                    country_count[name] = country_count[name]+1
        except ValueError:
            print 'Calais could not handle the language'
            country_count['~origin unknown'] = country_count['~origin unknown'] + 1
        count = count +1
    print 'Number of entries queried: ' + str(count)
    country_count['(TOTAL)'] = count
    country = sorted(country_count)
    print country
    us = 0
    with open('origin.csv', 'w') as fp:
        a = csv.writer(fp, delimiter=',')
        # NOTE(review): reconstructed nesting — the 'united states' row is
        # written once (guarded by ``us``) among the sorted rows; confirm
        # the intended position against the original file.
        for key in country:
            if key != 'united states':
                a.writerow([key, country_count[key]])
            if us == 0:
                a.writerow(['united states', country_count['united states']])
                us = 1
def main():
    # Earlier/simpler variant of the country-breakdown script: resolve each
    # student's address to a country via OpenCalais, fall back to the email
    # TLD, and write the tallies to countrybreakdown.csv.
    # read in csv file to extract the email and addresses field
    # put email and addr into a list of tuples
    email_addr = []
    with open('OECx_PH241x_3T2014_student_profile_info_2014-10-20-1645.csv', 'rU') as f:
        reader = csv.reader(f)
        for row in reader:
            pair = [row[3], row[9]]
            email_addr.append(pair)
    # create dictionary to find the country code | to iterate over the dictionary=> for key in d:
    country_code = {}
    with open('Country_code.csv', 'rU') as f:
        reader = csv.reader(f)
        for row in reader:
            key = row[0].split(' ', 1)[0].lower()
            value = row[0].split(' ', 1)[1].lower()
            country_code[key] = value
    # make Calais calls to extract country name
    api_key = 'wukyjrm778py5wry9qdtgk9u'
    calais = Calais(api_key)
    # dictionary to store all the results
    country_count = {}
    country_count['united states'] = 0
    count = 0
    for pair in email_addr:
        try:
            response = {}
            if pair[1] != '':
                response = calais.analyze(pair[1])
                # if the addr contains country information
                name = ''
                if hasattr(response, 'entities'):
                    print response.entities
                    name = response.entities[0]['containedbycountry'].lower()
                    if '@' in name: #where email addresses are wrongly entered as addr
                        last_str = pair[0].split('.')[-1].lower()
                        if last_str in country_code:
                            name = country_code[last_str]
                        else:
                            name = 'united states'
                        if name not in country_count:
                            country_count[name] = 1
                        else:
                            country_count[name] = country_count[name]+1
                    else:
                        if name not in country_count:
                            country_count[name] = 1
                        else:
                            country_count[name] = country_count[name]+1
                # otherwise, check the email addr
                else:
                    # NOTE(review): reconstructed nesting — this else appears
                    # to pair with the hasattr() check above; confirm against
                    # the original file.
                    last_str = pair[0].split('.')[-1].lower()
                    if last_str in country_code:
                        name = country_code[last_str]
                    else:
                        name = 'united states'
                    if name not in country_count:
                        country_count[name] = 1
                    else:
                        country_count[name] = country_count[name]+1
        except ValueError:
            print 'Calais could not handle the language'
        count = count +1
    print 'Number of entries queried: ' + str(count)
    print country_count
    with open('countrybreakdown.csv', 'w') as fp:
        a = csv.writer(fp, delimiter=',')
        for key in country_count:
            a.writerow([key, country_count[key]])
return process_type = settings.DEFAULT_PROCESS_TYPE if 'contentType' in settings.PROCESSING_DIR: d_proc_type = proc_dir['contentType'] if 'fields' not in params: if settings.ST_DEBUG: raise Exception('No "fields" found.') else: return # Create the instance of Calais and setup the parameters, # see open-calais.com for more information about user directives, # and processing directives c = Calais(settings.API_KEY) c.user_directives.update(settings.USER_DIR) c.processing_directives.update(settings.PROCESSING_DIR) c.processing_directives['contentType'] = process_type processed_tags = [] for item in params['fields']: try: d = item.copy() field = d.pop('name') proc_type = d.pop('process_type', process_type) markup = d.pop('markup', False) data = getattr(obj, field)
score float NOT NULL, PRIMARY KEY (ID) );""") cursor.execute("""CREATE TABLE tweet_entity ( ID int NOT NULL AUTO_INCREMENT, tweet_id bigint(20) NOT NULL, entity varchar(255) NOT NULL, relevance float NOT NULL, PRIMARY KEY (ID) );""") q_topic_insert = "INSERT INTO tweet_topic (tweet_id,topic,score) VALUES (%s,%s,%s);" q_entity_insert = "INSERT INTO tweet_entity (tweet_id,entity,relevance) VALUES (%s,%s,%s);" # Configure Calais connection calais = Calais(config.calais_api_key, submitter=config.calais_user) userprofiles = {} last_time = time.time() for username in usernames: if limit == None: cursor.execute("SELECT id, content FROM tweets_sample WHERE username='******'") else: cursor.execute("SELECT id, content FROM tweets_sample WHERE username='******' LIMIT " + str(limit)) data = cursor.fetchall() analyzed = 0 skipped = 0 topics = {} no_topics = 0
def classify(real_url_hash): API_KEY = "***REMOVED***" ''' # Open database connection db = MySQLdb.connect( host = '127.0.0.1', user = '******', passwd = 'globelab', db = 'condor', port = 3307) ''' try: with open('config-local.json') as fh: config = json.load(fh) except IOError: with open('config.json') as fh: config = json.load(fh) db = MySQLdb.connect( host=config['mysql']['host'], port=config['mysql']['port'], user=config['mysql']['user'], passwd=config['mysql']['password'], db=config['mysql']['database'], use_unicode=True, charset="utf8") db.autocommit(True) #call calais function calais = Calais(API_KEY, submitter="python-calais classify") #run from train set pkfile = open('savedclassifier.pickle','rb') classifier = pickle.load(pkfile) # prepare a cursor object using cursor() method cursor = db.cursor() # Prepare SQL query to INSERT a record into the database. sql = "SELECT embedly_blob,real_url_hash FROM url_info WHERE real_url_hash='" + real_url_hash + "' LIMIT 1;" # Execute the SQL command cursor.execute(sql) # Fetch all the rows results = cursor.fetchall() #browse through results for row in results: #get results real_url_hash = row[1] jsondecode = json.loads(row[0]) title = jsondecode['title'] description = jsondecode['description'] url = jsondecode['url'] if title and description and url: analysetext = title + ' ' + description + ' ' + url else: analysetext = ' ' if title: analysetext = analysetext + ' ' + title if description: analysetext = analysetext + ' ' + description if url: analysetext = analysetext + ' ' + url analysetext.encode("utf8") analysetext = analysetext.encode("utf8") analysetext= analysetext.replace('"', '\'') #classifier 1 #naive bayes _topic = classifier.classify(analysetext) _score = classifier.get_score(analysetext) #classifier 2 #opencalais crosscheck try: result = calais.analyze(analysetext) topic = result.get_topics() score = result.get_topic_score() except: topic = "None" score = 0.0 sqlerr = 'INSERT INTO error_classify 
(real_url_hash, text) VALUES("' + real_url_hash+ '","' + analysetext + '");' cursor.execute(sqlerr) print real_url_hash print analysetext print topic print _topic #create json output jsonOutput = "{\"topic\":\"%s\" , \"score\":\"%f\" , \"_topic\":\"%s\" , \"_score\":\"%f\"}" % (topic, score, _topic, _score) sqlupdate = "UPDATE url_info SET topic_blob=\'" + jsonOutput + "\' WHERE real_url_hash=\'" + real_url_hash + "\';" x = cursor.execute(sqlupdate) trace = 'trace: updated url_info, url hash [%s]: %d' % (real_url_hash, x) #update score if topic == "Sports": cursor.execute("UPDATE url_info SET sports_score=\'" + str(score) + "\' WHERE real_url_hash=\'" + real_url_hash + "\';") elif topic == "None" and _topic == "sports": cursor.execute("UPDATE url_info SET sports_score=\'" + str(_score) + "\' WHERE real_url_hash=\'" + real_url_hash + "\';") else: cursor.execute("UPDATE url_info SET sports_score='0' WHERE real_url_hash=\'" + real_url_hash + "\';") db.close() pkfile.close()
if leftToWait>0: time.sleep(leftToWait) ret = func(*args,**kargs) lastTimeCalled[0] = time.clock() return ret return rateLimitedFunction return decorate Manager = QueryManager() # Calais Rate limits our analysis requests, # this throttles requests without needing to sleep # AnalysisQueue = queue.Queue(.4); # AnalysisQueue.execute() key = "c3wjfrkfmrsft3r5wgxm5skr" CalaisObj = Calais(key, submitter="Sam Purcell") def pr(*args): print args[0] % (len(args) > 1 and args[1:] or []) sys.stdout.flush() def tryConnection (applyfun): try: return applyfun() except exc.SQLAlchemyError: db.session.rollback() return applyfun() class News(): normalizers = { "feedzilla" : {
from calais import Calais API_KEY = "s5mba8qn5qb4vjmc663qxn8m" calais = Calais(API_KEY, submitter="jannae") result = calais.analyze_file("sotu2012.txt") result.print_summary() def print_entities(self): if not hasattr(self, "entities"): return None for item in self.entities: print "%s: %s (%.2f)" % (item['_type'], item['name'], item['relevance'])
from calais import Calais import os API_KEY = "v5q6rvm7h4uww6sumjxuw9t7" calais = Calais(API_KEY, submitter="python-calais demo") f = open("Text/test.txt", 'r+') NYT = f.read() f.close() result = calais.analyze(NYT) result.print_summary() result.print_entities()
from calais import Calais API_KEY = "dkm645ejqmq7aajt8cp6zxk7" calais = Calais(API_KEY, submitter="python-calais demo") result = calais.analyze( "My 15 year old Daughter has sores in her genital area and her mouth.She swares she did nothing but kiss her boyfriend.She also has flu-like symptoms." ) result.print_summary() result.print_entities() result.print_topics() p = raw_input()