import dstk

def geocode_address(address):
    # Use the address and convert it to coordinates via a local DSTK server.
    data = dstk.DSTK({'apiBase': 'http://localhost:8080/'})
    data.api_base = 'http://localhost:8080/'
    try:
        good_address = data.street2coordinates(address)
        # Usage would be good_address[address]['confidence'].
        # The call returns the following fields in the dict:
        #   __len__         12
        #   confidence      0.878
        #   country_code    u'US'
        #   country_code3   u'USA'
        #   country_name    u'United States'
        #   fips_county     u'25015'
        #   latitude        42.37572
        #   locality        u'Amherst'
        #   longitude       -72.509542
        #   region          u'MA'
        #   street_address  u'534 Main St'
        #   street_name     u'Main St'
        #   street_number   u'534'
        if good_address[address] is not None:
            return good_address
        else:
            print("Address was not valid: " + address)
            return "INVALID_ADDRESS"
    except Exception:
        print("Address was not valid: " + address)
        return "INVALID_ADDRESS"
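# A minimal usage sketch for geocode_address (not part of the original
# snippet); it assumes a DSTK server is listening on localhost:8080 and
# that the sample address resolves.
addr = '534 Main St, Amherst, MA'
result = geocode_address(addr)
if result != "INVALID_ADDRESS":
    match = result[addr]
    print(match['latitude'], match['longitude'], match['confidence'])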
def __init__(self, suffixes=None, cities=None, streets=None, zips=None, logger=None,
             backend="default", dstk_api_base=None, required_confidence=0.65):
    """
    suffixes, cities and streets provide a chance to use different lists than the
    provided lists. suffixes is probably good for most users, unless you have some
    suffixes not recognized by USPS.

    cities is a very expansive list that may lead to false positives in some cases.
    If you only have a few cities you know will show up, provide your own list for
    better accuracy. If you are parsing addresses across the US, the provided list
    is probably better.

    streets can be used to limit the list of possible streets an address can be on.
    It is blank by default, and positional clues are used instead. If you are only
    parsing a couple of cities, a list of all possible streets will reduce
    incorrect street names.

    Valid backends include "default" and "dstk". The dstk backend requires a
    dstk_api_base, e.g. 'http://example.com'.
    """
    super(AddressParser, self).__init__(suffixes, cities, streets, zips, logger)
    self.dstk_api_base = dstk_api_base
    self.required_confidence = required_confidence
    if backend == "dstk":
        if dstk_api_base is None:
            raise ValueError("dstk_api_base is required for the dstk backend.")
        self.dstk = dstk.DSTK({'apiBase': dstk_api_base})
    elif backend == "default":
        pass
    else:
        raise ValueError("backend must be either 'default' or 'dstk'.")
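# A hedged usage sketch for the constructor above. The dstk backend needs a
# reachable DSTK endpoint; the URL here is an assumption, not part of the
# snippet.
parser = AddressParser(backend="dstk",
                       dstk_api_base="http://localhost:8080",
                       required_confidence=0.8)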
def __init__(self, options, address_gen):
    """Batch geocode addresses using DSTK.

    :param options: URL string for a DSTK server, or an options dict for the
        DSTK client.
    :param address_gen: A generator that yields tuples of (address, object),
        where address is an address string. The address is geocoded, and the
        object is passed through to the result.
    :return:
    """
    import dstk

    if isinstance(options, basestring):
        # A single string, not an options dict
        options = {'apiBase': options}

    self.gen = address_gen
    self.dstk_client = dstk.DSTK(options)
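# A hedged sketch of driving the batch geocoder above. The owning class name
# (DSTKGeocoder here) and the sample rows are assumptions; only the
# constructor signature comes from the snippet.
rows = [('2543 Graystone Place, Simi Valley, CA 93065', {'row': 1}),
        ('534 Main St, Amherst, MA', {'row': 2})]
geocoder = DSTKGeocoder('http://localhost:8080', iter(rows))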
import praw
import dstk

def get_comments():
    r = praw.Reddit('Whiteknight scraping reddit for nasty comments '
                    'Url: https://github.com/whiteknightinc/white-knight')
    top_posts = r.get_subreddit('whiteknighttest').get_top(limit=10)
    comments_with_keywords = []
    toolkit = dstk.DSTK()
    # f = open("swearWordsValue.txt")
    # keywords = {}
    # for line in f:
    #     word, val = line.rstrip().split(",")
    #     keywords[word] = int(val)
    # f.close()
    for top_post in top_posts:
        submission = r.get_submission(submission_id=top_post.id)
        submission.replace_more_comments(limit=32, threshold=0)
        all_comments = submission.comments
        comments = praw.helpers.flatten_tree(all_comments)
        for comment in comments:
            words = comment.body
            response = toolkit.text2sentiment(words.encode('utf-8'))
            if response['score'] <= -3:
                print comment.body
                comments_with_keywords.append(comment)
            # for keyword in keywords.keys():
            #     count = words.count(keyword)
            #     if count > 0:
            #         score += count * keywords.get(keyword)
            # if score >= 10:
            #     comments_with_keywords.append(comment)
            #     break
    result = {}
    for num in range(len(comments_with_keywords)):
        result[num] = {}
        result[num]['text'] = comments_with_keywords[num].body
        result[num]['user'] = comments_with_keywords[num].author.name
        result[num]['permalink'] = comments_with_keywords[num].permalink
    return result
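# A quick standalone check of the sentiment call used above (a hedged sketch;
# assumes a DSTK server reachable at the client's default apiBase). The
# snippet filters on response['score'], so only the 'score' field is relied on.
import dstk
toolkit = dstk.DSTK()
print toolkit.text2sentiment('this is a wonderful day')['score']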
#!/usr/bin/env python
#
# A test case to reproduce https://github.com/petewarden/dstk/issues/4
#
# Calls the street2coordinates API repeatedly until it fails

import dstk

dstk = dstk.DSTK()

counter = 0
while True:
    test_input = '2543 Graystone Place, Simi Valley, CA 93065'
    result = dstk.street2coordinates(test_input)
    counter += 1
    print str(counter)
class Article(Base):
    '''Stores received and computed article data.'''
    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    places = Column(TextPickleType(pickler=json))
    sentiment = Column(Integer)
    last_referenced = Column(DateTime, default=datetime.datetime.now)
    dstk = dstk.DSTK()

    def __init__(self):
        self.sentiment = 0

    def extract(self, allowance):
        '''Check to see if the article is in the database. If it is, get the
        old data. If it's not, extract the data and decrement the allowance.'''
        query = session.query(Article).get(self.id)
        if query is not None:
            self.places = query.places
            self.sentiment = query.sentiment
            query.last_referenced = datetime.datetime.now()
            return "Cached"
        else:
            if allowance > 0:
                print "Running extraction =>",
                # response = urllib2.urlopen(self.source)
                # html = response.read()
                # target = self.dstk.html2story(html)
                # target = target['story']
                apiSummary = self.title + ' ' + self.summary
                # target = apiSummary + ' ' + target
                target = apiSummary
                target = target.encode('ascii', 'ignore')
                apiSummary = apiSummary.encode('ascii', 'ignore')
                # Normalize common aliases so text2places resolves them.
                target = target.replace("U.S.", "United States")
                target = target.replace("U.S.A", "United States")
                target = target.replace("America", "United States")
                target = target.replace("Obama", "United States")
                target = target.replace("U.K.", "England")
                target = target.replace("Britain", "England")
                target = target.replace("London", "England")
                target = target.replace("Kim Jong-Un", "Democratic Republic of Korea")
                self.places = self.dstk.text2places(target)
                self.sentiment = int(self.dstk.text2sentiment(apiSummary)['score'])
                session.add(self)
                print "Done."
                return "Extracted"
            else:
                return "Remove"

    def to_json(self):
        a = self
        a.countries = []
        a.long = ""
        a.lat = ""
        for place in a.places:
            if place['type'] == "COUNTRY":
                try:
                    a.countries.append(two2three[place['code']])
                except KeyError:
                    # The EU is not a country.
                    pass
        # Remove duplicates.
        a.countries = list(set(a.countries))
        # print place['longitude']
        # a.long += place['longitude'] + ","
        # a.lat += place['latitude'] + ","
        return {'aid': a.id, 'title': a.title, 'summary': a.summary,
                'sentiment': a.sentiment, 'link': a.source,
                'countries': a.countries, 'long': a.long, 'lat': a.lat,
                'source': a.trueSource}
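# Hedged usage sketch for the Article class above; the attribute setup and
# the surrounding session handling are assumptions about the host application.
article = Article()
article.id = 42
article.title = 'Example headline'
article.summary = 'Example summary text.'
status = article.extract(allowance=10)  # "Cached", "Extracted", or "Remove"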
def __init__(self, *args, **kwargs):
    super(Command, self).__init__(*args, **kwargs)
    self.r = praw.Reddit('user-agent')
    self.sub = self.r.get_subreddit('earthporn')
    self.dstk = dstk.DSTK()
import dstk

dstk = dstk.DSTK({'checkVersion': False})
print dstk.street2coordinates('2543 Graystone Place, Simi Valley, CA 93065')
import os
import time

import dstk
import pandas as pd

# "ec2" is assumed to be a boto EC2 connection, and "reservation" a
# reservation object, both created earlier in the script.

## Get console output:
reservations = ec2.get_all_instances()
instances = [i for r in reservations for i in r.instances]
this_instance = instances[-1]

time.sleep(180)

for r in ec2.get_all_instances():
    if r.id == reservation.id:
        break
this_instance = r.instances[0]
dns_name = this_instance.public_dns_name
base_url = "http://" + dns_name  # + "/maps/api/geocode/json?sensor=false&address="

## Instantiate the dstk instance
dstk_endpoint = dstk.DSTK({'apiBase': base_url})

## Load the iso data; na_values=[] keeps Namibia's ISO code 'NA' from being
## read as NaN, and the line after the read restores it explicitly.
iso_codes = pd.read_csv(
    './data/iso_country_code_names.txt',
    sep=';',
    names=['country_name', 'country_code'],
    na_values=[],
)
iso_codes['country_code'][iso_codes['country_name'] == 'NAMIBIA'] = 'NA'

## Walk across the files and geocode non-blank addresses
datadir = './data/cleaned_data'
country_files = os.listdir(datadir)

## fix this
country_files = [f for f in country_files if 'NL' in f]
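# A hedged sketch of the per-file geocoding loop the comments above point
# toward; the column name 'address' and the CSV layout are assumptions.
for fname in country_files:
    df = pd.read_csv(os.path.join(datadir, fname))
    for addr in df['address'].dropna():
        coords = dstk_endpoint.street2coordinates(addr)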