Example #1
import dstk

def geocode_address(address):
    # Convert a street address into coordinates via a local DSTK server.
    data = dstk.DSTK({'apiBase': 'http://localhost:8080/'})
    try:
        good_address = data.street2coordinates(address)
        # usage would be good_address[address]['confidence']
        # returns a dict with the following fields:
        #   __len__          12
        #   confidence       0.878
        #   country_code     u'US'
        #   country_code3    u'USA'
        #   country_name     u'United States'
        #   fips_county      u'25015'
        #   latitude         42.37572
        #   locality         u'Amherst'
        #   longitude        -72.509542
        #   region           u'MA'
        #   street_address   u'534 Main St'
        #   street_name      u'Main St'
        #   street_number    u'534'
        if good_address[address] is not None:
            return good_address
        else:
            print("Address was not valid: " + address)
            return "INVALID_ADDRESS"
    except Exception:
        print("Address was not valid: " + address)
        return "INVALID_ADDRESS"
Example #2
def __init__(self, suffixes=None, cities=None, streets=None, zips=None,
             logger=None, backend="default", dstk_api_base=None,
             required_confidence=0.65):
    """
    suffixes, cities and streets let you supply different lists than the
    provided defaults. suffixes is probably fine for most users, unless you
    have suffixes not recognized by USPS. cities is a very expansive list
    that may lead to false positives in some cases; if only a few known
    cities will show up in your data, provide your own list for better
    accuracy. If you are parsing addresses from across the US, the provided
    list is probably better.
    streets can be used to limit the possible streets an address is on. It
    is empty by default, and positional clues are used instead. If you are
    only handling a couple of cities, a list of all possible streets will
    reduce incorrect street names.
    Valid backends are "default" and "dstk". The dstk backend requires a
    dstk_api_base, e.g. 'http://example.com'.
    """
    super(AddressParser, self).__init__(suffixes, cities, streets, zips,
                                        logger)
    self.dstk_api_base = dstk_api_base
    self.required_confidence = required_confidence
    if backend == "dstk":
        if dstk_api_base is None:
            raise ValueError("dstk_api_base is required for dstk backend.")
        self.dstk = dstk.DSTK({'apiBase': dstk_api_base})
    elif backend == "default":
        pass
    else:
        raise ValueError("backend must be either 'default' or 'dstk'.")
Example #3
    def __init__(self, options, address_gen):
        """Batch geocode addresses using DSTK.

        :param url: URL to a DTSK server
        :param address_gen: A generator that yields tuples of (address, object), where address is an address string.
            The address is geocoded, and the object is passed thorugh to the result.
        :return:

        """
        import dstk

        if isinstance(options, basestring):
            # Single string, not an options dict
            options = {'apiBase': options}

        self.gen = address_gen

        self.dstk_client = dstk.DSTK(options)
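A feeding sketch; the enclosing class name is not shown in the excerpt, so BatchGeocoder and the sample rows below are assumptions:

rows = [('2543 Graystone Place, Simi Valley, CA 93065', {'id': 1}),
        ('534 Main St, Amherst, MA', {'id': 2})]

def gen_rows(rows):
    # Yield (address, passthrough_object) pairs, as the constructor expects.
    for address, record in rows:
        yield (address, record)

geocoder = BatchGeocoder('http://localhost:8080', gen_rows(rows))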
Example #4
import praw
import dstk

def get_comments():
    r = praw.Reddit('Whiteknight scraping reddit for nasty comments. '
                    'Url: https://github.com/whiteknightinc/white-knight')

    top_posts = r.get_subreddit('whiteknighttest').get_top(limit=10)
    comments_with_keywords = []
    toolkit = dstk.DSTK()
    # f = open("swearWordsValue.txt")
    # keywords = {}
    # for line in f:
    #     word, val = line.rstrip().split(",")
    #     keywords[word] = int(val)
    # f.close()

    for top_post in top_posts:
        submission = r.get_submission(submission_id=top_post.id)
        submission.replace_more_comments(limit=32, threshold=0)
        all_comments = submission.comments
        comments = praw.helpers.flatten_tree(all_comments)

        for comment in comments:
            words = comment.body
            response = toolkit.text2sentiment(words.encode('utf-8'))
            if response['score'] <= -3:
                print comment.body
                comments_with_keywords.append(comment)
            # for keyword in keywords.keys():
            #     count = words.count(keyword)
            #     if count > 0:
            #         score += count * (keywords.get(keyword))
            #         if score >= 10:
            #             comments_with_keywords.append(comment)
            #             break

    result = {}
    for num, comment in enumerate(comments_with_keywords):
        result[num] = {}
        result[num]['text'] = comment.body
        result[num]['user'] = comment.author.name
        result[num]['permalink'] = comment.permalink
    return result
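A short consumption sketch for the returned dict (Python 2 style, matching the snippet):

flagged = get_comments()
for num in sorted(flagged):
    item = flagged[num]
    print "%s by %s: %s" % (item['permalink'], item['user'], item['text'][:80])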
Example #5
#!/usr/bin/env python
#
# A test case to reproduce https://github.com/petewarden/dstk/issues/4
#
# Calls the street2coordinates API repeatedly until it fails

import dstk

dstk = dstk.DSTK()

counter = 0
while True:

    test_input = '2543 Graystone Place, Simi Valley, CA 93065'

    result = dstk.street2coordinates(test_input)

    counter += 1
    print str(counter)
Example #6
class Article(Base):
    '''Stores received and computed article data.'''
    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    places = Column(TextPickleType(pickler=json))
    sentiment = Column(Integer)
    last_referenced = Column(DateTime, default=datetime.datetime.now)

    dstk = dstk.DSTK()
   
    def __init__(self):
        self.sentiment = 0

    def extract(self, allowance):
        '''Check to see if the article is in the database.
           If it is, get the old data. If it's not, then 
           extract the data and decrement the allowance.'''
        
        query = session.query(Article).get(self.id)
        
        if query is not None:
            self.places = query.places
            self.sentiment = query.sentiment
            query.last_referenced = datetime.datetime.now()
            return "Cached"
        else:
            if allowance > 0:
                print "Running extraction =>",
                # response=urllib2.urlopen(self.source)
                # html=response.read()
                # target=self.dstk.html2story(html)
                # target=target['story']

                apiSummary=self.title + ' ' + self.summary 
                # target = apiSummary+ ' ' + target
                target = apiSummary

                target = target.encode('ascii', 'ignore')
                apiSummary = apiSummary.encode('ascii', 'ignore')

                # Normalize common aliases so text2places can resolve them.
                target = target.replace("U.S.", "United States")
                target = target.replace("U.S.A", "United States")
                target = target.replace("America", "United States")
                target = target.replace("Obama", "United States")

                target = target.replace("U.K.", "England")
                target = target.replace("Britain", "England")
                target = target.replace("London", "England")

                target = target.replace("Kim Jong-Un", "Democratic Republic of Korea")

                self.places = self.dstk.text2places(target)
                self.sentiment = int(self.dstk.text2sentiment(apiSummary)['score'])
            
                session.add(self)
                
                print "Done."
                return "Extracted"
            else:
                return "Remove"
        
    def to_json(self):
        a = self
        a.countries = []
        a.long = ""
        a.lat = ""
        for place in a.places:
            if place['type'] == "COUNTRY":
                try:
                    a.countries.append(two2three[place['code']])
                except KeyError:
                    # The EU is not a country.
                    pass
        
        # Remove duplicates
        a.countries = list(set(a.countries))
                
        # print place['longitude']
        # a.long +=place['longitude'] + ","
        # a.lat +=place['latitude'] + ","
        
        return {'aid': a.id, 'title': a.title, 'summary': a.summary,
                'sentiment': a.sentiment, 'link': a.source,
                'countries': a.countries, 'long': a.long, 'lat': a.lat,
                'source': a.trueSource}
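A hedged usage sketch of the caching flow; the session handling, feed fields, and per-run allowance below are assumptions about the surrounding application:

allowance = 100                          # hypothetical per-run extraction budget
article = Article()
article.id = 42                          # hypothetical primary key from the feed
article.title = u"Example headline"
article.summary = u"Example summary mentioning Britain and the U.S."
status = article.extract(allowance)
if status == "Extracted":
    allowance -= 1                       # the caller decrements the budget
session.commit()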
Example #7
def __init__(self, *args, **kwargs):
    super(Command, self).__init__(*args, **kwargs)
    self.r = praw.Reddit('user-agent')
    self.sub = self.r.get_subreddit('earthporn')
    self.dstk = dstk.DSTK()
Example #8
import dstk

dstk = dstk.DSTK({'checkVersion': False})

print dstk.street2coordinates('2543 Graystone Place, Simi Valley, CA 93065')
Example #9
## Assumes an EC2 connection (ec2) and a reservation launched earlier in the
## script; the imports below are what this excerpt needs.
import os
import time

import dstk
import pandas as pd

## Get console output:
reservations = ec2.get_all_instances()
instances = [i for r in reservations for i in r.instances]
this_instance = instances[-1]

## Give the new instance time to boot before querying it.
time.sleep(180)

for r in ec2.get_all_instances():
    if r.id == reservation.id:
        break
this_instance = r.instances[0]
dns_name = this_instance.public_dns_name
base_url = "http://" + dns_name  #+ "/maps/api/geocode/json?sensor=false&address="

## Instantiate the dstk instance
dstk_endpoint = dstk.DSTK({'apiBase': base_url})

## Load the iso data
iso_codes = pd.read_csv(
    './data/iso_country_code_names.txt',
    sep=';',
    names=['country_name', 'country_code'],
    na_values=[],
)
iso_codes.loc[iso_codes['country_name'] == 'NAMIBIA', 'country_code'] = 'NA'

## Walk across the files and geocode non-blank addresses
datadir = './data/cleaned_data'
country_files = os.listdir(datadir)  ## fix this
country_files = [f for f in country_files if 'NL' in f]
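The excerpt stops before the geocoding step itself; a minimal sketch of how it might continue, assuming each cleaned file is a CSV with an 'address' column (the column name is an assumption):

for fname in country_files:
    df = pd.read_csv(os.path.join(datadir, fname))
    for addr in df['address'].dropna():
        # street2coordinates returns a dict keyed by the input address.
        result = dstk_endpoint.street2coordinates(addr)
        match = result.get(addr)
        if match is not None:
            print addr, match['latitude'], match['longitude']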