def populate_database_with_semantic_data_from_comments(calais_api_key, db_cursor, debug):
    calais = Calais(calais_api_key, submitter='usermine')
    db_cursor.execute('SELECT id, comment FROM comments WHERE updated = 0')
    for comment_data in db_cursor.fetchall():
        id = comment_data[0]
        comment = comment_data[1]
        try:
            result = calais.analyze(comment)
            if hasattr(result, 'entities'):
                for entity in result.entities:
                    entity_name = entity['name']
                    db_cursor.execute('INSERT INTO entities (id, entity) VALUES (NULL, ?)', [entity_name])
            if hasattr(result, 'topics'):
                for topic in result.topics:
                    topic_name = topic['categoryName']
                    db_cursor.execute('INSERT INTO topics (id, topic) VALUES (NULL, ?)', [topic_name])
        except:
            if debug:
                print sys.exc_info()
        db_cursor.execute('UPDATE comments SET updated=1 WHERE id = ?', [id])
        if debug:
            print '.'
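A short usage sketch for the helper above; it assumes a sqlite3 connection (consistent with the `?` placeholders), and the database file name and API key are hypothetical placeholders.

# Minimal sketch, assuming a sqlite3 database with comments/entities/topics tables.
import sqlite3

conn = sqlite3.connect('comments.db')  # hypothetical file name
cursor = conn.cursor()
populate_database_with_semantic_data_from_comments('YOUR_CALAIS_API_KEY', cursor, debug=True)
conn.commit()
conn.close()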
def get_calais_subjects(text, uid):
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ITagHelperSettingsSchema)
    api_key = settings.calais_api_key
    relevance = settings.calais_relevance
    subjects = []
    if api_key:
        calais = Calais(api_key)
        try:
            result = calais.analyze(text, external_id=uid)
        except:
            return []
        # if hasattr(result, 'entities'):
        #     for entity in result.entities:
        #         if entity['_type'] in PREFERRED_ENTITIES:
        #             subjects.append(entity['name'])
        if hasattr(result, 'socialTag'):
            for tag in result.socialTag:
                if float(tag['importance']) > relevance:
                    subjects.append(tag['name'])
        # if hasattr(result, 'relations'):
        #     for fact in result.relations:
        #         if fact['_type'] in PREFERRED_FACTS:
        #             ft = fact.get(fact['_type'].lower())
        #             if ft:
        #                 subjects.append(ft)
    return subjects
def process_calais(content, key):
    calais = Calais(key)
    response = calais.analyze(content)
    people = [entity["name"]
              for entity in getattr(response, "entities", [])
              if entity["_type"] == "Person"]
    return {"people": people}
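A possible call for process_calais; the input file name and key below are placeholders rather than values from the original snippet.

# Sketch: print the people mentioned in a local text file.
with open('article.txt') as fh:  # hypothetical input file
    content = fh.read()
print process_calais(content, 'YOUR_CALAIS_API_KEY')  # e.g. {'people': ['Barack Obama']}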
def analyze(model, text=None, backend='calais'):
    from calais import Calais
    from django.conf import settings
    calais = Calais(settings.CALAIS_API_KEY, submitter='newscredit')
    if text:
        result = calais.analyze(text)
    else:
        result = calais.analyze(model.analysis_text())
    people = []
    for entity in result.entities:
        if entity['_type'] == 'Person':
            try:
                person = Person.objects.get(uri=entity['__reference'])
            except Person.DoesNotExist, e:
                person = Person()
            person.from_calais(entity)
            person.save()
            people.append(person)
def analyze(request):
    API_KEY = 'kgmykdr862hdfhkzkuchnxkc'
    calais = Calais(API_KEY, 'python')
    try:
        result = calais.analyze(request.POST['content'])
        if result:
            return HttpResponse(json.dumps(result.get_json_entities()), mimetype="application/json")
        else:
            return HttpResponse(json.dumps(""), mimetype="application/json")
    except KeyError:
        return HttpResponse(json.dumps(""), mimetype="application/json")
def contract(request, contract):
    response = {}
    collection = connection()
    blocks = collection.find({'contracts': contract}, sort=[('date', ASCENDING)])
    # get last block and display if it was approved or not
    calais = Calais(settings.CALAIS, submitter="hello world")
    newbloks = []
    for block in blocks:
        block['entities'] = calais.analyze(block['string']).entities
        newbloks.append(block)
    response['items'] = newbloks
    response['contract'] = contract
    return render_to_response('contract.html', response)
def analyze(model, text=None, backend='calais'):
    from calais import Calais
    from django.conf import settings
    calais = Calais(settings.CALAIS_API_KEY, submitter='newscredit')
    if not text:
        _text = model.analysis_text()
    else:
        _text = text
    # we cannot analyse if our total content is under 100 characters
    # after HTML cleaning. We leave OpenCalais to do this as they have
    # advanced heuristics to do it. If our text to analyse is less than
    # 100 characters, we skip the analysis.
    if len(_text) < 100:
        return
    result = calais.analyze(_text)
    records = []
    try:
        for entity in result.entities:
            if entity['_type'] == 'Person':
                _model = Person
            elif entity['_type'] in ['City', 'Country', 'Continent', 'Facility', 'Region']:
                _model = Place
            elif entity['_type'] in ['Organization', 'Company']:
                _model = Organisation
            else:
                continue
            try:
                _record = _model.objects.get(uri=entity['__reference'])
            except _model.DoesNotExist, e:
                _record = _model()
            _record.from_calais(entity)
            _record.save()
            model.add_entity(_record)
            records.append(_record)
    except AttributeError:
        # this happens if Calais throws an error. To ensure we continue
        # processing other records pass this error and return False
        return False
    return result, records
class classifier():
    API_KEY = 'xyby6x47ycxj56bkkb83s9he'

    def __init__(self):
        self.calais = Calais(self.API_KEY, submitter='SocialTV Demo')

    def process(self, ip, text):
        result = self.calais.analyze(text)
        all_tags = []
        if hasattr(result, 'entities'):
            all_tags.extend(result.entities)
        if hasattr(result, 'socialTag'):
            # all_tags.extend(result.socialTag)
            pass
        if len(all_tags) == 0:
            return []
        output = []
        for tag in all_tags:
            if tag['name'].lower() not in dirtyWords.DIRTY:
                qwiki = self._get_qwiki_url(tag['name'])
                if len(qwiki) > 0:
                    img = self._get_google_image(ip, tag['name'])
                    print "Tag: " + tag['name']
                    output.append({'text': text, 'tag': tag['name'], 'qwiki': qwiki, 'img': img})
        return output

    def _get_qwiki_url(self, text):
        try:
            tmp_txt = text.replace(' ', '_')
            response = urllib2.urlopen("http://www.qwiki.com/embed/" + urllib2.quote(tmp_txt) + "?autoplay=true")
            return "http://www.qwiki.com/embed/" + urllib2.quote(tmp_txt) + "?autoplay=true"
        except urllib2.URLError, e:
            response = urllib2.urlopen("http://embed-api.qwiki.com/api/v1/search.json?count=1&q=" + urllib2.quote(text))
            html = response.read()
            html_eval = json.loads(html)
            if len(html_eval) > 0:
                return self._process_qwiki_results(text, html_eval)
            else:
                return []
def main():
    """ Main method """
    if not sys.stdin:
        usage()
        sys.exit(1)
    input = sys.stdin.read()  # read from stdin
    input = input.replace("<", "")
    input = input.replace(">", "")
    api_key = config.read_config('calais', 'api_key')
    calais = Calais(api_key, submitter="fridakahlo")
    entities = calais.analyze(input).entities
    # for e in entities: print e['name'], len(e['instances']), '<br>'
    linked_text = add_links(entities, input)
    for line in linked_text.splitlines():
        print "<p>", line, "<p>"
def entities(env, start_response):
    """Extracts entities from resume utilizing the OpenCalais webservice."""
    start_response('200 OK', [('Content-Type', 'text/xml')])
    API_KEY = "kqyhhfppufvmvxspkstwjxw5"
    calais = Calais(API_KEY, submitter="resume_analysis")
    try:
        with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
            text = f.read()
    except:
        raise restlite.Status, '400 Error Reading File'
    try:
        results = calais.analyze(text)
    except Exception as e:
        return "<error>%s</error>" % e
    entities_tuples = [(entity['name'], entity['_type'], entity['relevance'])
                       for entity in results.entities]
    doc = create_xml({'entities': entities_tuples})
    return [str(doc.toxml())]
class calaisApi:
    def __init__(self):
        self.calais = Calais(KEY, submitter="GodelTest")

    @persistent_memoize
    def calais_run(self, sentence):
        entities = []
        try:
            result = self.calais.analyze(sentence)
        except ValueError:
            return
        if hasattr(result, "entities"):
            if len(result.entities) > 0:
                for results in result.entities:
                    entities.append(results)
        if len(entities) > 0:
            return entities
        else:
            return False
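A hedged usage sketch for the calaisApi wrapper above, assuming KEY and the @persistent_memoize decorator are defined elsewhere in the module; the sample sentence is illustrative only.

# Sketch: calais_run returns a list of entity dicts, or False/None when nothing useful comes back.
api = calaisApi()
entities = api.calais_run("Barack Obama spoke in Washington.")
if entities:
    for e in entities:
        print e['name'], e['_type']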
def _get_people(text):
    '''
    Runs input text through Calais to extract people, coreferences and their
    locations. This function returns a canonical name for any given source in
    the document and contextual information about where coreferences appear,
    based on the text before and after the pronoun occurrence.

    Takes full story text as input.

    This is a pretty bare-bones function. It doesn't handle Calais API errors,
    so it tends to crap out from time to time. Future refinements should
    account for this.
    '''
    # Run input text through Calais
    calais = Calais(API_KEY, submitter="tbc-coref-test")
    annotations = calais.analyze(text)

    # If no entities come back, peace out
    if not hasattr(annotations, 'entities'):
        return False

    coref = {}  # Dictionary to hold our coreference object.
    for e in annotations.entities:
        instances = []
        # We only care about Person entities, not companies, places, etc.
        if e['_type'] == 'Person':
            # For each instance of that entity (which includes pronouns and other references) ...
            for i in e['instances']:
                # Collect the coreference text (exact), the preceding text (prefix) and the
                # following text (suffix) for reference information. We'll need this later.
                instances.append((i.get('exact'), i.get('suffix', ''), i.get('prefix', '')))
            # Associate the canonical name with the coreference and context information gathered
            # above for use later.
            name = e.get("commonname", e.get('name', None))
            coref[name] = instances
    return coref
from calais import Calais

API_KEY = "s5mba8qn5qb4vjmc663qxn8m"
calais = Calais(API_KEY, submitter="jannae")
result = calais.analyze("George Bush was the President of the United States of America until 2009. Barack Obama is the new President of the United States now.")
result.print_summary()
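Beyond print_summary(), the response object exposes the entities it found; a minimal sketch using only the attributes and keys seen elsewhere in these examples:

# Sketch: iterate entities defensively, since short texts may return none.
if hasattr(result, 'entities'):
    for entity in result.entities:
        print entity['name'], entity['_type'], entity['relevance']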
from calais import Calais
import collections

# set Calais API key and create new instance
API_KEY = "g8gnzpdz52gkwyduv75zecem"
calais = Calais(API_KEY, submitter="python-calais demo")

# demo text
input_text = "George Bush was the President of the United States of America until 2009. Barack Obama is the new President of the United States now."
with open('stdin_1.txt', 'r') as f:
    input_text = f.read()
f.closed

result = calais.analyze(input_text)
# result.print_entities()

# initialize dictionary that will contain the linked data
link_list = {}
# initialize detected references count (collected for assignment)
detected_count = 0

# loop through each entity and assign a link
for i in range(len(result.entities)):
    if 'resolutions' in result.entities[i]:
        # if Calais has assigned an RDF value, use that as the link
        name = result.entities[i]['name']
        link = result.entities[i]['resolutions'][0]['id']
        link_list[name] = link
analyzed = 0
skipped = 0
topics = {}
no_topics = 0
named_entities = {}
no_named_entities = 0

for tweet in data:
    try:
        analyzed += 1
        # Wait until wait time is passed and let Calais analyze a new tweet
        time_passed = time.time() - last_time
        wait_time = max(wait_per_request - time_passed, 0)
        time.sleep(wait_time)
        result = calais.analyze(tweet[1])
        last_time = time.time()
        # Extract entities
        try:
            for entity in result.entities:
                name = entity["name"]
                if (name == u'RT') | (entity["_type"] == u'URL'):
                    continue
                cursor.execute(q_entity_insert, (str(tweet[0]), name, str(entity["relevance"])))
                db.commit()
                if name in named_entities:
                    named_entities[name] += 1
                else:
                    named_entities[name] = 1
        except AttributeError:
def analyze_url(request):
    # Insert the article into the database if it's new
    a, created = Article.objects.get_or_create(url=request.POST['url'])
    a.save()

    # List of tags for UI
    tag_list = []

    # If the article is new, then get its tags
    if created:
        # Use Goose to get the article text
        g = Goose()
        article_text = g.extract(url=a.url).cleaned_text.encode('utf-8')

        # Use python-calais to analyze the article text
        calais = Calais(settings.OC_API_KEY)
        result = calais.analyze(article_text)

        try:
            # Create a dictionary to store all the entities and relevance scores
            # Get rid of all duplicates
            entity_dict = dict([])
            for entity in result.entities:
                if not entity_dict.has_key(entity['name']):
                    entity_dict[entity['name']] = entity['relevance']

            # Sort the entities and topics by confidence/relevance
            entity_list = sorted(entity_dict.iteritems(), key=itemgetter(1), reverse=True)

            # Insert the OpenCalais entities and topics with confidence > .3 into the database
            # and add to a list for UI
            for tag, confidence in entity_list:
                if confidence > .3:
                    t = Tag(
                        article=a,
                        tag=tag,
                        confidence=confidence,
                        service=Tag.OPEN_CALAIS,
                    )
                    t.save()
                    tag_list.append(t)
                else:
                    break
        except KeyError:
            print 'No entities found'

        try:
            # Create a dictionary to store all the topics and relevance scores
            # Get rid of all duplicates
            topic_dict = dict()
            for topic in result.simplified_response['topics']:
                if not topic_dict.has_key(topic['categoryName']):
                    topic_dict[topic['categoryName']] = topic['score']

            # Sort topics by relevance
            topic_list = sorted(topic_dict.iteritems(), key=itemgetter(1), reverse=True)

            for tag, confidence in topic_list:
                if confidence > .3:
                    t = Tag(
                        article=a,
                        tag=tag,
                        confidence=confidence,
                        service=Tag.OPEN_CALAIS,
                    )
                    t.save()
                    tag_list.append(t)
                else:
                    break
        except KeyError:
            print 'No topics found'

        ###
        # TODO: Add AlchemyAPI support here
        ###

    # If the article is old, just get the tags from the database
    else:
        tag_list = list(Tag.objects.filter(article=a))

    shuffle(tag_list)

    # Context for the template
    c = {'tags': tag_list}
    return render(request, 'tag_list.html', c)
# feature counter >= 2 works
feature_counter = 0
for entry in sportslist:
    if analysetext.lower().find(entry.lower()) >= 0:
        feature_counter += 1

# classifier 1
# naive bayes
_topic = classifier.classify(analysetext)
_score = classifier.get_score(analysetext)

# classifier 2
# opencalais crosscheck
try:
    result = calais.analyze(analysetext)
    topic = result.get_topics()
    score = result.get_topic_score()
except:
    topic = "None"
    score = 0.0

classifier_json = json.dumps({'topic': topic, 'score': score, '_topic': _topic, '_score': _score})

if topic == "Sports" and feature_counter < 2:
    sports_score = str(score)
elif topic == "None" and _topic == "sports" and feature_counter < 2:
    sports_score = str(_score)
elif feature_counter >= 2:
    sports_score = '1.0'
else:
    sports_score = '0.0'
htmlname = "Text/mirror.html"; elif fid ==4: fname = "Text/independent.txt"; htmlname = "Text/independent.html"; else: fname = "Text/dailymail.txt"; htmlname = "Text/dailymail.html"; f = open(fname, 'r+') article = f.read() f.close() filter = ['Currency', 'IndustryTerm' , 'MedicalCondition'] result = calais.analyze(article) #result.print_summary() result.print_entities() article = article.decode('ascii', 'replace').encode('ascii', 'replace') enriched_article = article; for entity in result.entities: if entity['_type'] not in filter: name = entity['name'] print "Resolving " + name name = name.decode('ascii', 'replace').encode('ascii', 'replace') str = "" try: link = entity['resolutions'][0]['id']
def get_entities(content):
    API_KEY = os.environ['API_KEY']
    calais = Calais(API_KEY, submitter="python-calais demo")
    result = calais.analyze(content)
    result.print_entities()
def web_extract_terms(text, raw_query='', service='tagthe'):
    """
    Given a text, extract keyword terms with the selected web_service.

    Args:
        text: raw text from where to extract terms
        raw_query: a query which may contextualize the extraction (only used by yahoo)
        service: which web service to use
    Returns:
        query: a sequence of query terms
    """
    service = service.lower().strip()
    if not service in WEB_SERVICES.keys():
        raise ValueError('%s is an invalid web service, possible choices are %s' % (service, repr(WEB_SERVICES.keys())))

    # 1. Build the query:
    query = {}
    apikey = settings.WEB_SERVICES_KEYS.get(service, '')
    if service == 'wordsfinder':
        query = {
            'apikey': apikey,
            'context': text + raw_query,
        }
    elif service == 'alchemy':
        query = {
            'apikey': apikey,
            'text': text + raw_query,
            'outputMode': 'json',
        }
    elif service == 'yahoo':
        query = {
            'appid': apikey,
            'context': text,
            'output': 'json',
        }
        if raw_query:
            query.update({'query': raw_query})
    elif service == 'tagthe':
        query = {
            'text': text + raw_query,
            'view': 'json',
        }
    elif service == 'opencalais':
        # use the python interface, obtained from:
        # http://www.opencalais.com/applications/python-calais-python-interface-opencalais-api
        # logging.debug('Using the opencalais interface with key %s' % apikey)
        s = Calais(apikey)
        try:
            res = s.analyze(text + raw_query)
        except:
            raise WebServiceException(service, 'error in request')
        # logging.debug('The raw response: %s' % res.raw_response)
        if hasattr(res, 'topics') or hasattr(res, 'entities'):
            retval = [t['categoryName'] for t in res.topics] if hasattr(res, 'topics') else []
            retval += [t['name'] for t in res.entities] if hasattr(res, 'entities') else []
            return retval
        else:
            # raise WebServiceException(service, 'No topics or entities found')
            # logging.info("OpenCalais didn't return topics|entities for %s" % text)
            return ["", ]
    # elif service == 'extractor':
    #     # use the python interface
    #     # logging.debug('using the extractor interface with key %s' % apikey)
    #     extractor = ExtractorService(key=apikey, url=WEB_SERVICES[service])
    #     raw_response = extractor.extract(text + raw_query)
    #     # logging.debug('The raw response: %s' % raw_response)
    #
    #     if raw_response.get('ExtractionStatus') == '-1':
    #         print "failure!"
    #         raise WebServiceException(service, "Failure in request")
    #     else:
    #         # TODO: what DOES it return?
    #         return raw_response

    # 2. Try to call the service:
    resp = None
    # logging.debug('requesting %s' % WEB_SERVICES[service] + '?%s' % urlencode(query))
    try:
        # HACK: tagthe has issues with POST requests, so try and do a GET
        # max length for a GET request is 2048:
        # http://stackoverflow.com/questions/1344616/max-length-of-query-string-in-an-ajax-get-request
        if service == 'tagthe':  # and len(urlencode(query)) <= 2048:
            resp_url = urlopen(WEB_SERVICES[service] + '?%s' % urlencode(query))
        else:
            # HACK: fallback to yahoo if the request is too much for tagthe
            # service = 'yahoo' if service == 'tagthe' else service
            resp_url = urlopen(WEB_SERVICES[service], urlencode(query))
        resp = resp_url.read()  # this causes the exception...
        # logging.debug(u"%s returned %s" % (service, resp))
    except Exception as e:
        # TODO: retry in timeouts and stuff
        # logging.debug('Error in request: %s' % e, exc_info=True)
        raise WebServiceException(service, 'Error in request to service : %s' % e)

    # 3. Process the response:
    if resp:
        result = ''
        if service == 'alchemy':
            data = json.loads(resp)
            if data['status'] == 'ERROR':
                raise WebServiceException(service, 'call returned error status')
            result = [re.sub('-[^ ] ', '', e['text']) for e in data['keywords']]
        elif service == 'yahoo':
            data = json.loads(resp)
            result = data['ResultSet']['Result']
        elif service == 'wordsfinder':
            parsed_response = parseString(resp)
            e = parsed_response.getElementsByTagName('error')
            if e:
                raise WebServiceException(service, 'error code %s' % e.firstChild.data)
            else:
                result = [node.firstChild.data for node in parsed_response.getElementsByTagName('keyword')]
        elif service == 'tagthe':
            data = json.loads(resp)
            if 'memes' in data and 'dimensions' in data['memes'][0] and 'topic' in data['memes'][0]['dimensions']:
                result = data['memes'][0]['dimensions']['topic']
                # logging.debug(u'tagthe result %s' % result)
            else:
                result = ['', ]
        return [unescape(w) for w in result]
        # TODO: maybe find a way to call 'em all and have a super-set of kws?
    else:
        return ''
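A hedged call sketch for web_extract_terms, assuming WEB_SERVICES and the per-service API keys are configured as the function expects; the sample text is illustrative.

# Sketch: route a text through the OpenCalais branch of the dispatcher above.
terms = web_extract_terms("Barack Obama visited Paris last week.", service='opencalais')
print terms  # mixed list of topic category names and entity names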
            n = sib

    # PERP - INDIVIDUALS
    if 'ENTITIES/INDIVIDUALS INVOLVED' in soup.find_all('h2')[i]:
        next = True
        n = soup.find_all('h2')[i]
        perp_indivs = []
        perp_rank = []
        while next == True:
            sib = n.next_sibling
            if sib.name == 'h2':
                next = False
            elif sib.name != 'br' and sib != '\n' not in sib:
                # test to see if it's a person using calais
                r = calais.analyze(sib.encode('utf-8').strip())
                if hasattr(r, 'entities'):
                    for j in range(len(r.entities)):
                        if r.entities[j]['_type'] == 'Person':
                            perp_indivs.append(r.entities[j]['name'])
                        if r.entities[j]['_type'] == 'Position':
                            position = r.entities[j]['name']
                            # PERP RANK
                            fit = False
                            if 'President' in position or 'Chairman' in position or 'CEO' in position or 'Chief Executive Officer' in position:
                                if 'CEO/Chairman/President' not in perp_rank:
                                    perp_rank.append('CEO/Chairman/President')
                                    fit = True
                            if 'Founder' in position:
_topic = "None" _score = 0 ''' rate limit issues Submission Rate (per second), 4, ''' time.sleep( 0.3 ) ''' classifier 2 ''' #opencalais crosscheck print '================' if real_url_hash in ('e3cb4b6333506ffe5d746c9fabe32a00f9513f46', '76a134d8cb993198d89da02889592d3984a05861', 'c55fc7c88b1d9c0a58a1f29e44f9e0a69f1cc3fa'): result = calais.analyze("None") else: result = calais.analyze(analysetext) topic = result.get_topics() score = result.get_topic_score() if topic: print 'OC topic:' + topic print 'OC score:%f'% (score*100) else: topic = "None" score = 0 print '================' jsonOutput = "{\"topic\":\"%s\" , \"score\":\"%f\" , \"_topic\":\"%s\" , \"_score\":\"%f\"}" % (topic, score, _topic, _score) print jsonOutput sqlupdate = "UPDATE condor.url_info SET topic_blob=\'" + jsonOutput + "\' WHERE real_url_hash=\'" + real_url_hash + "\';"
def classify(real_url_hash):
    API_KEY = "***REMOVED***"
    '''
    # Open database connection
    db = MySQLdb.connect(
        host = '127.0.0.1',
        user = '******',
        passwd = 'globelab',
        db = 'condor',
        port = 3307)
    '''
    try:
        with open('config-local.json') as fh:
            config = json.load(fh)
    except IOError:
        with open('config.json') as fh:
            config = json.load(fh)

    db = MySQLdb.connect(
        host=config['mysql']['host'],
        port=config['mysql']['port'],
        user=config['mysql']['user'],
        passwd=config['mysql']['password'],
        db=config['mysql']['database'],
        use_unicode=True,
        charset="utf8")
    db.autocommit(True)

    # call calais function
    calais = Calais(API_KEY, submitter="python-calais classify")

    # run from train set
    pkfile = open('savedclassifier.pickle', 'rb')
    classifier = pickle.load(pkfile)

    # prepare a cursor object using cursor() method
    cursor = db.cursor()

    # Prepare SQL query to INSERT a record into the database.
    sql = "SELECT embedly_blob,real_url_hash FROM url_info WHERE real_url_hash='" + real_url_hash + "' LIMIT 1;"
    # Execute the SQL command
    cursor.execute(sql)
    # Fetch all the rows
    results = cursor.fetchall()

    # browse through results
    for row in results:
        # get results
        real_url_hash = row[1]
        jsondecode = json.loads(row[0])
        title = jsondecode['title']
        description = jsondecode['description']
        url = jsondecode['url']
        if title and description and url:
            analysetext = title + ' ' + description + ' ' + url
        else:
            analysetext = ' '
            if title:
                analysetext = analysetext + ' ' + title
            if description:
                analysetext = analysetext + ' ' + description
            if url:
                analysetext = analysetext + ' ' + url
        analysetext = analysetext.encode("utf8")
        analysetext = analysetext.replace('"', '\'')

        # classifier 1
        # naive bayes
        _topic = classifier.classify(analysetext)
        _score = classifier.get_score(analysetext)

        # classifier 2
        # opencalais crosscheck
        try:
            result = calais.analyze(analysetext)
            topic = result.get_topics()
            score = result.get_topic_score()
        except:
            topic = "None"
            score = 0.0
            sqlerr = 'INSERT INTO error_classify (real_url_hash, text) VALUES("' + real_url_hash + '","' + analysetext + '");'
            cursor.execute(sqlerr)

        print real_url_hash
        print analysetext
        print topic
        print _topic

        # create json output
        jsonOutput = "{\"topic\":\"%s\" , \"score\":\"%f\" , \"_topic\":\"%s\" , \"_score\":\"%f\"}" % (topic, score, _topic, _score)
        sqlupdate = "UPDATE url_info SET topic_blob=\'" + jsonOutput + "\' WHERE real_url_hash=\'" + real_url_hash + "\';"
        x = cursor.execute(sqlupdate)
        trace = 'trace: updated url_info, url hash [%s]: %d' % (real_url_hash, x)

        # update score
        if topic == "Sports":
            cursor.execute("UPDATE url_info SET sports_score=\'" + str(score) + "\' WHERE real_url_hash=\'" + real_url_hash + "\';")
        elif topic == "None" and _topic == "sports":
            cursor.execute("UPDATE url_info SET sports_score=\'" + str(_score) + "\' WHERE real_url_hash=\'" + real_url_hash + "\';")
        else:
            cursor.execute("UPDATE url_info SET sports_score='0' WHERE real_url_hash=\'" + real_url_hash + "\';")

    db.close()
    pkfile.close()
processed_tags = []
for item in params['fields']:
    try:
        d = item.copy()
        field = d.pop('name')
        proc_type = d.pop('process_type', process_type)
        markup = d.pop('markup', False)

        data = getattr(obj, field)
        data = force_unicode(getattr(obj, field))

        # Analyze the text (data)
        result = c.analyze(data)

        # Retrieve the Django content type for the obj
        ctype = ContentType.objects.get_for_model(obj)

        # Remove existing items, this ensures tagged items are updated correctly
        SuperTaggedItem.objects.filter(content_type=ctype, object_id=obj.pk, field=field).delete()
        if settings.PROCESS_RELATIONS:
            SuperTaggedRelationItem.objects.filter(content_type=ctype, object_id=obj.pk, field=field).delete()

        entities, relations, topics = [], [], []

        # Process entities, relations and topics
        if hasattr(result, 'entities'):
            entities = _processEntities(field, result.entities, obj, ctype, proc_type, tags)
        if hasattr(result, 'relations') and settings.PROCESS_RELATIONS:
            relations = _processRelations(field, result.relations, obj, ctype, proc_type, tags)
fout = open(("results/" + filename + ".html"), "w")
fout.write('<html>')
fout.write('\r\n')
fout.write('<head><title>' + filename + '</title></head>')
fout.write('\r\n')
fout.write('<body>')

with open(("articles/" + filename + ".txt"), "r") as myfile:
    sys.stdin = myfile
    content = ""
    for line in sys.stdin:
        content += line

API_KEY = "f7vhuv2kt4fxufuvv6eznwpe"
calais = Calais(API_KEY, submitter="python-calais newsparser")
result = calais.analyze(content)

print "Summary of the Calais Analysis"
result.print_summary()
print "Entity of the Calais Analysis"
result.print_entities()

i = 0
temp = []
entityList = []
html = []
for entity in result.entities:
    if result.entities[i]["_type"] in [
            "City", "Company", "Country", "Movie", "Organization", "Person"
from calais import Calais

API_KEY = "k6s6cewwwc5zkemjqpw7yhru"
calais = Calais(API_KEY, submitter="python-calais demo")
result = calais.analyze("michelle obama")
print result.print_summary()
print result.print_topics()
print result.print_relations()
def main():
    # read in csv file to extract the email and addresses field
    # put email and addr into a list of tuples
    email_addr = []
    with open('OECx_PH241x_3T2014_student_profile_info_2014-10-20-1645.csv', 'rU') as f:
        reader = csv.reader(f)
        for row in reader:
            pair = [row[3], row[9]]
            email_addr.append(pair)

    # create dictionary to find the country code | to iterate over the dictionary => for key in d:
    country_code = {}
    with open('Country_code.csv', 'rU') as f:
        reader = csv.reader(f)
        for row in reader:
            key = row[0].split(' ', 1)[0].lower()
            value = row[0].split(' ', 1)[1].lower()
            country_code[key] = value

    # make Calais calls to extract country name
    api_key = 'wukyjrm778py5wry9qdtgk9u'
    calais = Calais(api_key)

    # dictionary to store all the results
    country_count = {}
    country_count['(TOTAL)'] = 0
    country_count['united states'] = 0
    country_count['~origin unknown'] = 0
    count = 0

    for pair in email_addr:
        check = 0
        try:
            response = {}
            if pair[1] != '':
                response = calais.analyze(pair[1])
                # if the addr contains country information
                if hasattr(response, 'entities'):
                    # entry is a list of 3 elements: priority (3 for ProvinceOrState,
                    # 2 for Country, 1 for EmailAddress), Country, Province
                    entry = [-1, '', '']
                    for each in response.entities:
                        if each['_type'] == 'ProvinceOrState':
                            try:
                                entry[1] = each['resolutions'][0]['containedbycountry'].lower()
                                entry[0] = 3
                                entry[2] = each['name'].lower()
                            except KeyError:
                                print 'Country name cannot be retrieved'
                        elif each['_type'] == 'Country':
                            if entry[0] < 2:
                                entry[0] = 2
                                entry[1] = each['name'].lower()
                        elif each['_type'] == 'EmailAddress':
                            if entry[0] < 1:
                                entry[0] = 1
                    if entry[0] == 3:
                        name = '(US) - ' + entry[2]
                        if entry[1] not in country_count:
                            country_count[entry[1]] = 1
                        else:
                            country_count[entry[1]] = 1 + country_count[entry[1]]
                        if entry[1] == 'united states':
                            if name not in country_count:
                                country_count[name] = 1
                            else:
                                country_count[name] = 1 + country_count[name]
                    elif entry[0] == 2:
                        if entry[1] not in country_count:
                            country_count[entry[1]] = 1
                        else:
                            country_count[entry[1]] = 1 + country_count[entry[1]]
                    elif entry[0] == 1:
                        check = 1  # go through email check
                    else:
                        country_count['~origin unknown'] = country_count['~origin unknown'] + 1
            else:
                check = 1

            # if addr is empty, query email address mapping table; if no entry, Unknown add 1
            # here we assume that all entries without addr and without strong indication of
            # country origins in their emails will be categorized under the USA entry
            if check == 1:
                # determine entry name
                email_endstr = pair[0].split('.')[-1].lower()
                if email_endstr in country_code:
                    name = country_code[email_endstr]
                else:
                    name = '~origin unknown'
                # add entry
                if name not in country_count:
                    country_count[name] = 1
                else:
                    country_count[name] = country_count[name] + 1
        except ValueError:
            print 'Calais could not handle the language'
            country_count['~origin unknown'] = country_count['~origin unknown'] + 1
        count = count + 1

    print 'Number of entries queried: ' + str(count)
    country_count['(TOTAL)'] = count
    country = sorted(country_count)
    print country

    us = 0
    with open('origin.csv', 'w') as fp:
        a = csv.writer(fp, delimiter=',')
        for key in country:
            if key != 'united states':
                a.writerow([key, country_count[key]])
            if us == 0:
                a.writerow(['united states', country_count['united states']])
                us = 1
def main():
    # read in csv file to extract the email and addresses field
    # put email and addr into a list of tuples
    email_addr = []
    with open('OECx_PH241x_3T2014_student_profile_info_2014-10-20-1645.csv', 'rU') as f:
        reader = csv.reader(f)
        for row in reader:
            pair = [row[3], row[9]]
            email_addr.append(pair)

    # create dictionary to find the country code | to iterate over the dictionary => for key in d:
    country_code = {}
    with open('Country_code.csv', 'rU') as f:
        reader = csv.reader(f)
        for row in reader:
            key = row[0].split(' ', 1)[0].lower()
            value = row[0].split(' ', 1)[1].lower()
            country_code[key] = value

    # make Calais calls to extract country name
    api_key = 'wukyjrm778py5wry9qdtgk9u'
    calais = Calais(api_key)

    # dictionary to store all the results
    country_count = {}
    country_count['united states'] = 0
    count = 0

    for pair in email_addr:
        try:
            response = {}
            if pair[1] != '':
                response = calais.analyze(pair[1])
                # if the addr contains country information
                name = ''
                if hasattr(response, 'entities'):
                    print response.entities
                    name = response.entities[0]['containedbycountry'].lower()
                    if '@' in name:  # where email addresses are wrongly entered as addr
                        last_str = pair[0].split('.')[-1].lower()
                        if last_str in country_code:
                            name = country_code[last_str]
                        else:
                            name = 'united states'
                        if name not in country_count:
                            country_count[name] = 1
                        else:
                            country_count[name] = country_count[name] + 1
                    else:
                        if name not in country_count:
                            country_count[name] = 1
                        else:
                            country_count[name] = country_count[name] + 1
            # otherwise, check the email addr
            else:
                last_str = pair[0].split('.')[-1].lower()
                if last_str in country_code:
                    name = country_code[last_str]
                else:
                    name = 'united states'
                if name not in country_count:
                    country_count[name] = 1
                else:
                    country_count[name] = country_count[name] + 1
        except ValueError:
            print 'Calais could not handle the language'
        count = count + 1

    print 'Number of entries queried: ' + str(count)
    print country_count

    with open('countrybreakdown.csv', 'w') as fp:
        a = csv.writer(fp, delimiter=',')
        for key in country_count:
            a.writerow([key, country_count[key]])
import io, sys
from calais import Calais

API_KEY = "rg72c9882ypfjm24tvrfu6ab"
calais = Calais(API_KEY, submitter="python-calais demo")

# read text file
with open('file3.txt', 'r') as content_file:
    content = content_file.read()

# perform analysis on text
result = calais.analyze(content)
result.print_entities()

html_file = open("HTMLFile3.html", "w")

# the resulting entities obtained are not sorted, so we sort them here:
def comp(x, y):
    return y['instances'][0]['offset'] - x['instances'][0]['offset']

sorted_results = sorted(result.entities, cmp=comp)
b = content.decode("utf-8")

for i in range(len(sorted_results)):  # for each entity
    offset = sorted_results[i]['instances'][0]['offset']  # find offset
    length = sorted_results[i]['instances'][0]['length']  # find length
    total = offset + length  # find total length
    if offset != sorted_results[i-1]['instances'][0]['offset']:  # to prevent same words being linked twice
        if 'resolutions' in sorted_results[i]:  # if rdf document exists
            link = sorted_results[i]['resolutions'][0]['id']
            data = "<a href = \"" + link + "\" target=\"_blank\">" + sorted_results[i]['name'] + "</a>"
from calais import Calais

API_KEY = "djgq52vv8uufzykmnb9g7myv"
calais = Calais(API_KEY, submitter="python-calais demo")

result = calais.analyze('''Microsoft is a big company. George Bush was the President of the United States of America until 2009. Barack Obama is the new President of the United States now.''')
result2 = calais.analyze('''Microsoft is a big company''')
result3 = calais.analyze('''Troubled drug development company SFBC International said on Monday it has changed its name to PharmaNet Development Group Inc. and will be traded on Nasdaq under the stock symbol "PDGI".''')

d = {}
a = []
for i in result3.entities:
    if i["_type"] == "Technology":
        a.append(i["name"])
d["Technology"] = a
print d
from calais import Calais
import os

API_KEY = "v5q6rvm7h4uww6sumjxuw9t7"
calais = Calais(API_KEY, submitter="python-calais demo")

f = open("Text/test.txt", 'r+')
NYT = f.read()
f.close()

result = calais.analyze(NYT)
result.print_summary()
result.print_entities()
from calais import Calais

API_KEY = "dkm645ejqmq7aajt8cp6zxk7"
calais = Calais(API_KEY, submitter="python-calais demo")
result = calais.analyze(
    "My 15 year old Daughter has sores in her genital area and her mouth.She swares she did nothing but kiss her boyfriend.She also has flu-like symptoms."
)
result.print_summary()
result.print_entities()
result.print_topics()
p = raw_input()