Example #1
 def test_geocode_pass(self):
   code,(lat,lon,text) =  geocoder.geocode('Audubon Park')
   self.assertEqual(code,'OK')
   code,(lat,lon,text) =  geocoder.geocode('Superdome')
   self.assertEqual(code,'OK')
   code,(lat,lon,text) =  geocoder.geocode('643 Magazine Street')
   self.assertEqual(code,'OK')
Example #2
def match_phrase(phrase, verbose=False):
    # First step: attempt to map nicknames to standard place names
    if (phrase in nicknames):
        if verbose:
            print "Resolved nickname \"%s\" as \"%s\": " % (repr(phrase), repr(nicknames[phrase]))
        phrase = nicknames[phrase]

    # Second step: attempt to match mapped string in Nominatim
    G = None  # fall back value in case every lookup below fails
    try:
        G = geocoder.geocode(phrase,site='osm')
        if G['countrycode']:
            return G
    except ValueError:
        pass

    # Third step: attempt to run query, removing any (parenthesized) expressions
    paren_regex = "\(.+?\)"
    noparens = re.sub(paren_regex,'',phrase)

    if noparens != phrase:
        if verbose:
            print "* Removing parenthetical expression: \"%s\" -> \"%s\"" % (repr(noparens), repr(phrase))
        try:
            G = geocoder.geocode(noparens, site='osm')
            if G['countrycode']:
                if verbose:
                    print "** Resolved \"%s\" as \"%s\"" % (repr(phrase), repr(noparens))
                return G
        except ValueError:
            pass    

    return G
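A hedged usage sketch for the three-step flow above (nickname lookup, direct Nominatim query, retry with parenthesized text stripped). The `nicknames` mapping shown here is hypothetical; the function reads it from module scope, and the result is indexed the same way the function itself does:

nicknames = {"The Big Easy": "New Orleans, LA"}  # hypothetical module-level mapping
match = match_phrase("The Big Easy", verbose=True)
if match and match['countrycode']:
    print "matched, country code:", match['countrycode']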
Example #3
def main():
    p = optparse.OptionParser()
    p.add_option('-x', help='Specify number of random names to generate.', type='int', dest='times', default=None)
    p.add_option('-n', help='Specify filename that contains names.', dest='names')
    p.add_option('-a', help='Specify filename that contains addresses.', dest='addresses')
    (opts, args) = p.parse_args()

    generatenames(opts.times, opts.names, opts.addresses)
    print '=' * 40
    print 'Generating Geocodes.'
    print '=' * 40
    geocode('geocodes.csv')

    print """
Example #4
 def test_geocode_fail(self):
   code,(lat,lon,text) =  geocoder.geocode('Audubon Park')
   self.assertEqual(code,'OK')
   code,(lat,lon,text) =  geocoder.geocode('Superdome')
   self.assertEqual(code,'OK')
   code,(lat,lon,text) =  geocoder.geocode('643 Magazine Street')
   self.assertEqual(code,'OK')
   # not specific
   code,result =  geocoder.geocode('New Orleans, LA')
   self.assertNotEqual(code,'OK')
   self.assertIsNone(result)
   # ambiguous results
   code,result =  geocoder.geocode('Starbucks')
   self.assertNotEqual(code,'OK')
   self.assertIsNone(result)
Example #5
def add_ad():
    form = AdForm()
    db_sess = db_session.create_session()
    categories = db_sess.query(Categories).all()

    if form.submit.data:
        from random import choice
        # Build a random 100-digit string to use as the photo's filename.
        n = ''
        for _ in range(100):
            n += choice('1234567890')
        if geocode(form.address.data):
            ad = Ad(
                photo=n,
                address=form.address.data,
                name=form.name.data,
                category=db_sess.query(Categories).filter(
                    Categories.id == form.category.data).first().title,
                description=form.description.data,
                number=form.number.data,
                user_id=db_sess.query(User).filter(
                    User.login == flask_login.current_user.login).first().id)
        else:
            return render_template('add_ad.html',
                                   form=form,
                                   cate=categories,
                                   m="нет такого города")

        db_sess.add(ad)
        db_sess.commit()

        return redirect(f'/add_photo/{ad.id}/{n}')
    return render_template('add_ad.html', form=form, cate=categories)
Example #6
def trySearch(line, place, woetype):
    woeTypes = [woetype]

    # town, admin3, suburb
    townWoeTypes = ['7', '10', '22']
    if woetype in townWoeTypes:
        woeTypes = townWoeTypes

    try:
        g = geocoder.geocode(place, {
            'woeRestrict': ','.join(woeTypes),
            'allowedSources': 'geonameid'
        })
        if g and g.geonameid() and g.isFull():
            return GeocodeSuccess(u'\t'.join([
                unicode(g.geonameid()),
                unicode(g.woeType()),
                unicode(g.lat()),
                unicode(g.lng()),
                g.matchedName(),
                line.decode('utf-8')
            ]))
        else:
            return GeocodeFailure(line.decode('utf-8'))
    except:
        print 'timeout'
        return GeocodeTimeout(line.decode('utf-8'))
Example #7
 def SHOw(self):
     # The dialog title and prompt are Russian: "Address" / "enter an address: (Moskva Guryanova 2)".
     adres, ok_pressed = QInputDialog.getText(
         self, "Адрес", 'введите адрес: (Москва Гурьянова 2)')
     if ok_pressed:
         if adres:
             try:
                 self.coords_pt = list(get_coordinates(adres))
             except:
                 pass
             if self.coords_pt:
                 # Reuse the formatted address instead of calling geocode() twice.
                 self.adress = geocode(adres)['metaDataProperty'][
                     'GeocoderMetaData']['Address']['formatted']
                 self.adressPt.setText(self.adress)
                 self.map.setPixmap(self.get_map())
Example #8
def match_gps(locstr, verbose=False):
    suffix = locstr.split(':')[-1]
    (lat,lon) = [float(x) for x in suffix.strip('( )').split(',')]
    reverse = True
    query = "%s,%s" % (lat, lon)
    if verbose:
        print "Trying to match string \"%s\" as lat/lon..." % repr(query)
    return geocoder.geocode(query,site='osm')
        
Example #9
  def GET(self):
    if 'mode' not in web.input():
      return render.app(timemode='now',time=datetime.today().strftime("%H:%M"))

    tvars = dict(web.input())
    tvars['error'] = None

    fromplace = getattr(web.input(),'from')
    toplace = web.input().to
    if not fromplace or not toplace:
      tvars['error'] = 'Please enter an address or landmark for From and To'
      return render.app(**tvars)

    from_result,fromgeo = geocoder.geocode(fromplace)
    if from_result != 'OK':
      tvars['error'] = 'Unable to find address for %s' % fromplace
      return render.app(**tvars)
    tvars['fromgeo'] = fromgeo

    to_result,togeo = geocoder.geocode(toplace)
    if to_result != 'OK':
      tvars['error'] = 'Unable to find address for %s' % toplace
      return render.app(**tvars)
    tvars['togeo'] = togeo

    timemode = web.input().get('timemode')
    if timemode == 'now':
      result = otp.plan(fromgeo[0:2],togeo[0:2],web.input().mode)
    else:
      try:
        time = dateparser.parse_time(web.input().time)
      except ValueError:
        tvars['error'] = "Invalid time format"
        return render.app(**tvars)

      result = otp.plan(fromgeo[0:2],togeo[0:2],web.input().mode,time,timemode)
    if 'plan' in result:
      tvars['result'] = result
    else:
      # no itinerary found - rare but possible
      tvars['error'] = result['error']['msg']
    return render.app(**tvars)
Example #10
def directions(dirfrom,dirto,mode,at,atmode):
  if not mode:
    mode = 'ANY'
  from_result,from_place = geocode(dirfrom)
  if from_result != 'OK':
    return error_for_geocode(from_result,dirfrom)
  to_result,to_place = geocode(dirto)
  if to_result != 'OK':
    return error_for_geocode(to_result,dirto)
  date = None
  datemode = None
  if at:
    date = dateparser.parse_time(at)
    if atmode == 'arrive':
      datemode = 'arrive'
    else:
      datemode = 'depart'
    print "date chosen %s,mode=%s" % (date,datemode)
  plan = otp.plan(from_place[0:2],to_place[0:2],mode.upper(),date,datemode)
  return plan_instructions(plan)
Example #11
 def set_location(self):
     logging.info("Set Location for Address: %s" % self.address)
     if self.address:
         r = geocoder.geocode(self.address)
         if not r:
             raise ValueError("Cannot find geolocation for address: %s" % self.address)
         pts = r.split(",")
         logging.info("Geocode Lookup Results: %s" % pts)
         # pts[2] and pts[3] are indexed below, so at least four fields are required.
         if pts and len(pts) >= 4:
             self.location = db.GeoPt(pts[2], pts[3])
             logging.info("Geo Point: %s" % self.location)
             self.update_location()
Example #12
def trySearchHelper(line, place, woeTypes):
  try:
    g = geocoder.geocode(place, {
      'woeRestrict': ','.join(woeTypes),
      'allowedSources': 'geonameid'
    })
    if g and g.geonameid() and g.isFull():
      return GeocodeSuccess(u'\t'.join([unicode(g.geonameid()), unicode(g.woeType()), unicode(g.lat()), unicode(g.lng()), g.matchedName(), line.decode('utf-8')]))
    else:
      return GeocodeFailure(line.decode('utf-8'))
  except:
      traceback.print_exc()
      print 'timeout'
      return GeocodeTimeout(line.decode('utf-8'))
Example #13
 def indPt(self):
     self.flag = not self.flag
     try:
         ind = geocode(
             self.adress
         )['metaDataProperty']['GeocoderMetaData']['Address']['postal_code']
     except:
         ind = '(Нет почтового индекса)'  # Russian: "(no postal code)"
     if self.flag:
         if self.adressPt.text() == self.adress:
             self.adressPt.setText(f'{self.adressPt.text()} {ind}')
     else:
         if self.adressPt.text() == f'{self.adress} {ind}':
             self.adressPt.setText(self.adress)
Example #14
def trySearch(line, place, woetype):
  woeTypes = [woetype]

  # town, admin3, suburb
  townWoeTypes = ['7', '10', '22']
  if woetype in townWoeTypes:
    woeTypes = townWoeTypes

  try:
    g = geocoder.geocode(place, {
      'woeRestrict': ','.join(woeTypes),
      'allowedSources': 'geonameid'
    })
    if g and g.geonameid() and g.isFull():
      return GeocodeSuccess(u'\t'.join([unicode(g.geonameid()), unicode(g.woeType()), unicode(g.lat()), unicode(g.lng()), g.matchedName(), line.decode('utf-8')]))
    else:
      return GeocodeFailure(line.decode('utf-8'))
  except:
      print 'timeout'
      return GeocodeTimeout(line.decode('utf-8'))
Example #15

size = width, height = 650, 500
screen = pygame.display.set_mode(size)
pygame.init()
toponym_to_find = get_toponym()
ll, spn = geocoder.get_ll_span(toponym_to_find)
finded_place = ll + "," + "pmgnm"
q = 0
mapp = button(q)
button2()
button3((255, 0, 0), 't')
u = 0
get_image(ll, spn, mapp, finded_place)
while 1:
    address_to_out = geocoder.geocode(toponym_to_find)['metaDataProperty']['GeocoderMetaData']['text']
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            raise SystemExit
        elif event.type == pygame.MOUSEBUTTONDOWN:
            x, y = event.pos
            if y < 50 and x < 300:
                q += 1
                mapp = button(q)
                get_image(ll, spn, mapp, finded_place)
            if y < 50 and x > 300 and x < 600:
                toponym_to_find = get_toponym()
                ll, spn = geocoder.get_ll_span(toponym_to_find)
                finded_place = ll + "," + "pmgnm"
                q = 0
Example #16
def main():
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.optionxform=str # to preserve case
    config.read(args.config_file) 
    logging.basicConfig(
        format='%(asctime)s %(filename)s:%(lineno)d %(message)s',
        filename='cleanup.log', filemode="w", level=logging.INFO)
    
    sheets = get_GoogleSheets(config)
    for_github = []

    # Load geocoder early so that invalid TSV path errors are caught early on.
    geocoder = csv_geocoder.CSVGeocoder(
        config['GEOCODING'].get('TSV_PATH'),
        arcgis)
    for s in sheets:
        logging.info("Processing sheet %s", s.name)

        ### Clean Private Sheet Entries. ###
        # note: the private sheet gets updated on the fly and re-downloaded to ensure continuity between fixes (granted it's slower).
        
        range_ = f'{s.name}!A:AG'
        data = values2dataframe(s.read_values(range_))

        # Expand aggregated cases into one row each.
        logging.info("Rows before expansion: %d", len(data))
        if len(data) > 150000:
            logging.warning("Sheet %s has more than 150K rows, it should be split soon", s.name)
        data.aggregated_num_cases = pd.to_numeric(data.aggregated_num_cases, errors='coerce')
        data = duplicate_rows_per_column(data, "aggregated_num_cases")
        logging.info("Rows after expansion: %d", len(data))

        # Generate IDs for each row sequentially following the sheet_id-inc_int pattern.
        data['ID'] = s.base_id + "-" + pd.Series(range(1, len(data)+1)).astype(str)

        # Remove whitespace.
        data = trim_df(data)

        # Fix columns that can be fixed easily.
        data.sex = fix_sex(data.sex)

        # fix N/A => NA
        for col in data.select_dtypes("string"):
            data[col] = fix_na(data[col])

        # Regex fixes
        fixable, non_fixable = generate_error_tables(data)
        if len(fixable) > 0:
            logging.info('fixing %d regexps', len(fixable))
            s.fix_cells(fixable)
            data = values2dataframe(s.read_values(range_))
        
        # ~ negates, here clean = data with IDs not in non_fixable IDs.
        clean = data[~data.ID.isin(non_fixable.ID)]
        clean = clean.drop('row', axis=1)
        clean = clean.sort_values(by='ID')
        s.data = clean
        non_fixable = non_fixable.sort_values(by='ID')

        # Save error_reports
        # These are separated by Sheet.
        logging.info('Saving error reports')
        directory   = config['FILES']['ERRORS']
        file_name   = f'{s.name}.error-report.csv'
        error_file  = os.path.join(directory, file_name)
        non_fixable.to_csv(error_file, index=False, header=True, encoding="utf-8")
        for_github.append(error_file)
        
    # Combine data from all sheets into a single datafile
    all_data = []
    for s in sheets:
        logging.info("sheet %s had %d rows", s.name, len(s.data))
        all_data.append(s.data)
    
    all_data = pd.concat(all_data, ignore_index=True)
    all_data = all_data.sort_values(by='ID')
    logging.info("all_data has %d rows", len(all_data))

    # Fill geo columns.
    geocode_matched = 0
    for i, row in all_data.iterrows():
        geocode = geocoder.geocode(row.city, row.province, row.country)
        if not geocode:
            continue
        geocode_matched += 1
        all_data.at[i, 'latitude'] = geocode.lat
        all_data.at[i, 'longitude'] = geocode.lng
        all_data.at[i, 'geo_resolution'] = geocode.geo_resolution
        all_data.at[i, 'location'] = geocode.location
        all_data.at[i, 'admin3'] = geocode.admin3
        all_data.at[i, 'admin2'] = geocode.admin2
        all_data.at[i, 'admin1'] = geocode.admin1
        all_data.at[i, 'admin_id'] = geocode.admin_id
        all_data.at[i, 'country_new'] = geocode.country_new
    logging.info("Geocode matched %d/%d", geocode_matched, len(all_data))
    logging.info("Top 10 geocode misses: %s",geocoder.misses.most_common(10))
    with open("geocode_misses.csv", "w") as f:
        geocoder.write_misses_to_csv(f)
        logging.info("Wrote all geocode misses to geocode_misses.csv")
    if len(geocoder.new_geocodes) > 0:
        logging.info("Appending new geocodes to geo_admin.tsv")
        with open(config['GEOCODING'].get('TSV_PATH'), "a") as f:
            geocoder.append_new_geocodes_to_init_file(f)
        for_github.append(config['GEOCODING'].get('TSV_PATH'))
    # Reorganize csv columns so that they are in the same order as when we
    # used to have those geolocation within the spreadsheet.
    # This is to avoid breaking latestdata.csv consumers.
    all_data = all_data[["ID","age","sex","city","province","country","latitude","longitude","geo_resolution","date_onset_symptoms","date_admission_hospital","date_confirmation","symptoms","lives_in_Wuhan","travel_history_dates","travel_history_location","reported_market_exposure","additional_information","chronic_disease_binary","chronic_disease","source","sequence_available","outcome","date_death_or_discharge","notes_for_discussion","location","admin3","admin2","admin1","country_new","admin_id","data_moderator_initials","travel_history_binary"]]

    # Ensure the new data has at least as many rows as the previous latest file.
    latest_name = os.path.join(config['FILES']['LATEST'], 'latestdata.csv')

    line_diff = len(all_data) - len(pd.read_csv(latest_name))
    if line_diff >= 0:
        logging.info(f"Line check passed, {line_diff} new lines")
    else:
        logging.info("Line check failed")
        return 

    # save
    logging.info("Saving files to disk")
    dt = datetime.now().strftime('%Y-%m-%dT%H%M%S')
    file_name   = config['FILES']['DATA'].replace('TIMESTAMP', dt)
    all_data.to_csv(file_name, index=False, encoding="utf-8")
    all_data.to_csv(latest_name, index=False, encoding="utf-8")
    logging.info("Wrote %s, %s", file_name, latest_name)

    if args.push_to_git:
        logging.info("Pushing to github")
        # Create script for uploading to github
        for_github.extend([file_name, latest_name])
        script  = 'set -e\n'
        script += 'cd {}\n'.format(config['GIT']['REPO'])
        script += 'git pull origin master\n'
        
        for g in for_github:
            script += f'git add {g}\n'
        script += 'git commit -m "data update"\n'
        script += 'git push origin master\n'
        script += f'cd {os.getcwd()}\n'
        print(script)
        os.system(script)
Example #17
 def test_geocode_latlon(self):
   code,(lat,lon,text) =  geocoder.geocode('29.949803,-90.068858')
   self.assertEqual(code,'OK')
   self.assertAlmostEqual(lat,29.949803)
   self.assertAlmostEqual(lon,-90.068858)
Example #18
def scrape(source="dcn",
           provided_url_key=False,
           limit=False,
           since="last_record",
           until="now",
           test=False):
    """Extracts new certificates by scraping CSP websites and writes data to the web_certificates table in the database.
    
    Parameters:
     - `source` (str): Specifies the source website being scraped for CSPs. Can be either `dcn` for Daily Commercial News or `ocn` for Ontario Construction News.
     - `provided_url_key` (str or False): A specific url_key to scrape. False by default.
     - `limit` (int): Specifies a limit on the number of certificates to be scraped. Default is no limit.
     - `since` (str): Specifies the date from which to begin looking for new CSPs. Can be either `last_record` or a `yyyy-mm-dd` string.
     - `until` (str): Specifies the date at which to end the search for new CSPs. Can be either `now` or a `yyyy-mm-dd` string.
     - `test` (bool): Set to True to skip writing to the database and return a DataFrame of scraped certificates instead.

    Returns:
     - `True` if 1 or more certificates were scraped
     - `False` if no certificates were scraped
     - a Pandas DataFrame containing the new certificates if `test=True`

    """

    # Initialize string and lambda functions based on source :
    def get_details(entry):
        entry = base_url + entry
        url_key = entry.split(base_aug_url)[1]
        while True:
            try:
                response = requests.get(entry)
                break
            except requests.exceptions.ConnectionError:
                sleep(1)
                continue
        if response.status_code == 404:
            return
        html = response.content
        entry_soup = BeautifulSoup(html, "html.parser")
        if source == "dcn":
            pub_date = entry_soup.find("time").get_text()
            cert_type = entry_soup.find("h1").get_text()
            if cert_type == "Certificates and Notices":
                cert_type = (
                    "csp"
                )  # old style -> assume csp by default even if it might not be true
                city = (entry_soup.find("div", {
                    "class": "content-left"
                }).find("h4").get_text())
                address = entry_soup.find("p", {
                    "class": "print-visible"
                }).get_text()
                title = (entry_soup.find_all(
                    "section", {"class": "content"})[3].find("p").get_text())
            else:
                cert_type = ("csp" if cert_type
                             == "Certificate of Substantial Performance" else
                             cert_type)
                city = entry_soup.find_all("dl")[0].find("dt").get_text()
                address = entry_soup.find_all("dl")[1].find("dt").get_text()
                title = entry_soup.find_all("dl")[2].find("dd").get_text()
                if address.startswith(
                        "This is to certify"
                ):  # no address available; change sequence going forward
                    address = ""
                    title = entry_soup.find_all("dl")[1].find("dd").get_text()
            company_results = {
                key.get_text(): value.get_text()
                for key, value in zip(entry_soup.find_all("dt"),
                                      entry_soup.find_all("dd"))
            }
            owner = company_results.get(
                "Name of owner:", company_results.get("Name of Owner", np.nan))
            contractor = company_results.get(
                "Name of contractor:",
                company_results.get("Name of Contractor", np.nan))
            engineer = company_results.get(
                "Name of payment certifier:",
                company_results.get(
                    "Name of Certifier",
                    company_results.get("Name of certifier:", np.nan),
                ),
            )
        elif source == "ocn":
            if ("Non-Payment" in entry_soup.find("h1", {
                    "class": "entry-title"
            }).get_text()):
                cert_type = "np"
            else:
                try:
                    header = entry_soup.find("h2", {
                        "class": "ocn-heading"
                    }).find_next_sibling("p").get_text()
                except AttributeError:
                    header = ' '
                if "Notice of Termination" in header:
                    cert_type = "term"
                else:
                    cert_type = "csp"
            pub_date = str(
                dateutil.parser.parse(
                    entry_soup.find("date").get_text()).date())
            try:
                city = entry_soup.find("h2", {
                    "class": "ocn-subheading"
                }).get_text().split(":")[0]
            except AttributeError:
                city = ''
            if cert_type == "csp":
                address = (entry_soup.find("div", {
                    "class": "ocn-certificate"
                }).find("p").get_text())
                try:
                    title = (entry_soup.find("h2", {
                        "class": "ocn-heading"
                    }).find_next_sibling("p").get_text())
                except AttributeError:
                    title = ''
                company_soup = entry_soup.find(
                    "div", {"class": "ocn-participant-wrap"})
                company_results = {
                    key.get_text(): value.get_text()
                    for key, value in zip(
                        company_soup.find_all(
                            "div", {"class": "participant-type"})[::2],
                        company_soup.find_all(
                            "div", {"class": "participant-name-wrap"}),
                    )
                }
                owner = company_results.get("Name of Owner", np.nan)
                contractor = company_results.get("Name of Contractor", np.nan)
                engineer = company_results.get("Name of Payment Certifier",
                                               np.nan)
            elif cert_type == "np":
                address = (entry_soup.find("h4", {
                    "class": "ocn-subheading"
                }).find_next("p").get_text())
                title = address  # temporary until we see more of these
                for x in entry_soup.find_all("strong"):
                    try:
                        if x.get_text() == "Name of owner:":
                            owner = x.find_parent().get_text().split(": ")[1]
                        if x.get_text() == "Name of contractor:":
                            contractor = x.find_parent().get_text().split(
                                ": ")[1]
                    except AttributeError:
                        pass
                engineer = np.nan
            elif cert_type == "term":
                address = (entry_soup.find("h1", {
                    "class": "entry-title"
                }).get_text())
                title = address  # temporary until we see more of these
                for x in entry_soup.find_all("strong"):
                    try:
                        if x.get_text() == "Name of owner:":
                            owner = x.find_parent().get_text().split(": ")[1]
                        if x.get_text() == "Name of contractor:":
                            contractor = x.find_parent().get_text().split(
                                ": ")[1]
                    except AttributeError:
                        pass
                engineer = np.nan
        elif source == "l2b":
            cert_type_text = entry_soup.find("h2").get_text()
            #cert_type = ("csp" if "Form 9" in cert_type_text else cert_type_text)
            if "Form 9" in cert_type_text:
                cert_type = "csp"
            elif "Form 10" in cert_type_text:
                cert_type = "ccs"
            else:
                cert_type = cert_type_text
            attr_pairs = {}
            fields = entry_soup.find_all('p', {'class': 'mb-25'})
            for field in fields:
                try:
                    attr_pair = [
                        s for s in re.findall('[^\t^\n^\r]*', field.get_text())
                        if s
                    ]
                    attr_pairs.update({attr_pair[0]: attr_pair[1]})
                except IndexError:
                    pass
            retry_count = 0
            while True:
                try:
                    response = requests.get(base_url)
                    break
                except requests.exceptions.ConnectionError:
                    logger.info(
                        f"L2B not responding again ({retry_count}). waiting 2 seconds and retrying..."
                    )
                    retry_count += 1
                    sleep(2)
            html = response.content
            soup = BeautifulSoup(html, "html.parser")
            pub_date = [
                str(parse_date(entry.find_all('td')[1].get_text()).date())
                for entry in soup.find('tbody').find_all('tr')
                if url_key in str(entry)
            ][0]
            if cert_type == 'ccs':
                city = attr_pairs.get('Of premises at', np.nan)
                address = attr_pairs.get('Of premises at', np.nan)
                title = ' '.join((attr_pairs.get(
                    'The subcontract provided for the supply of the following services or materials',
                    ''), attr_pairs.get('To the following improvement', '')))
                title = np.nan if title in ('', ' ') else title
            else:
                city = attr_pairs.get('Where the Premises is Situated', np.nan)
                address = attr_pairs.get('Where the Premises is Located',
                                         np.nan)
                title = attr_pairs.get(
                    'This is to certify that the contract for the following improvement',
                    np.nan)
            owner = attr_pairs.get('Name of Owner', np.nan)
            contractor = attr_pairs.get('Name of Contractor', np.nan)
            engineer = attr_pairs.get('Name of Payment Certifier', np.nan)
        return (
            pub_date,
            city,
            address,
            title,
            owner,
            contractor,
            engineer,
            url_key,
            cert_type,
            source,
        )

    pub_date, city, address, title, owner, contractor, engineer, url_key, cert_type = [
        [] for _ in range(9)
    ]
    if until == "now":
        until = datetime.datetime.now().date()
    else:
        try:
            until = re.findall("\d{4}-\d{2}-\d{2}", until)[0]
        except IndexError:
            raise ValueError(
                "`until` parameter should be in the format yyyy-mm-dd if not a key_word"
            )
    if since == "last_record":
        hist_query = """
            SELECT pub_date 
            FROM web_certificates 
            WHERE source=%s
            ORDER BY pub_date DESC LIMIT 1
        """
        with create_connection() as conn:
            cur = conn.cursor()
            cur.execute(hist_query, [source])
            last_date = cur.fetchone()[0]
            ld_year = int(last_date[:4])
            ld_month = int(last_date[5:7])
            ld_day = int(last_date[8:])
            since = datetime.datetime(ld_year, ld_month, ld_day).date()
    else:
        valid_since_date = re.search("\d{4}-\d{2}-\d{2}", since)
        if not valid_since_date:
            raise ValueError(
                "`since` parameter should be in the format yyyy-mm-dd if not a "
                "predefined term.")
    if source == "dcn":
        base_url = "https://canada.constructconnect.com"
        base_aug_url = (
            "https://canada.constructconnect.com/dcn/certificates-and-notices/"
        )
        base_search_url = "https://canada.constructconnect.com/dcn/certificates-and-\
                notices?perpage=1000&phrase=&sort=publish_date&owner=&contractor="

        custom_param_url = "&date=custom&date_from={}&date_to={}#results"
        get_number_of_matches = lambda soup: int(
            re.compile("\d\d*").findall(
                (soup.find("span", {
                    "class": "search-results__total"
                }).get_text()))[0])
        get_entries = lambda soup: [
            x.find("a").get("href")
            for x in soup.find_all("article", {"class": "cards-item"})
        ]
    elif source == "ocn":
        base_url = ""
        base_aug_url = "https://ontarioconstructionnews.com/certificates/"
        base_search_url = "https://ontarioconstructionnews.com/certificates/?\
            per_page=1000&certificates_page=1&search=&form_id=&owner_name_like\
                =&contractor_name_like="

        custom_param_url = (
            "&date_published=custom&date_published_from={}&date_published_to={}"
        )
        get_number_of_matches = lambda soup: int((soup.find_all(
            "span", {"class": "items-found"})[1].get_text().split(" of ")[1]))
        get_entries = lambda soup: [
            x.find("a").get("href")
            for x in soup.find_all("td", {"class": "col-location"})
        ]
    elif source == "l2b":
        base_url = "https://certificates.link2build.ca/"
        base_aug_url = "Search/Detail/"
        base_search_url = "https://certificates.link2build.ca/"
        custom_param_url = ""
        since = str(since)
        until = str(until)
        get_entries = lambda soup: [
            entry.find('a').get('href')
            for entry in soup.find('tbody').find_all('tr')
            if parse_date(since) <= parse_date(
                entry.find_all('td')[1].get_text()) <= parse_date(until)
        ]
        get_number_of_matches = lambda soup: len(get_entries(soup))
    else:
        raise ValueError("Must specify CSP source.")
    if provided_url_key:
        details = get_details(provided_url_key)
        return pd.DataFrame(
            data={
                "pub_date": details[0],
                "city": details[1],
                "address": details[2],
                "title": details[3],
                "owner": details[4],
                "contractor": details[5],
                "engineer": details[6],
                "url_key": details[7],
                "cert_type": details[8],
                "source": [source] * len(details[0]),
            })
    date_param_url = custom_param_url.format(since, until)
    response = requests.get(base_search_url + date_param_url)
    html = response.content
    soup = BeautifulSoup(html, "html.parser")
    number_of_matches = get_number_of_matches(soup)
    if not number_of_matches:
        logger.info(
            "Nothing new to scrape in timeframe specified - exiting scrape function."
        )
        return False  # signaling that scrape returned nothing
    logger.info(
        f"scraping all of {number_of_matches} new certificates since {since}..."
    )
    bar = progressbar.ProgressBar(
        maxval=number_of_matches + 1,
        widgets=[
            progressbar.Bar("=", "[", "]"), " ",
            progressbar.Percentage()
        ],
    )
    bar.start()
    logged_key_query = """
        SELECT url_key 
        FROM web_certificates 
        WHERE source=%s
    """
    with create_connection() as conn:
        logged_url_keys = list(
            pd.read_sql(logged_key_query, conn, params=[source]).url_key)
    entries = get_entries(soup)
    for i, entry in enumerate(entries, 1):
        check_url_key = (base_url + entry).split(base_aug_url)[1]
        if not test and check_url_key in logged_url_keys:
            logger.info(
                f"entry for {check_url_key} was already logged - continuing with the next one (if any)..."
            )
            continue
        details = get_details(entry)
        # print(entry)
        if not details:
            logger.info(
                f"entry for {check_url_key} was a 404 page - continuing with the next one (if any)..."
            )
            continue
        for cumulative, item in zip(
            [
                pub_date,
                city,
                address,
                title,
                owner,
                contractor,
                engineer,
                url_key,
                cert_type,
            ],
                details,
        ):
            cumulative.append(item)
        if limit and (i >= limit):
            logger.info("limit reached - breaking out of loop.")
            break
        bar.update(i + 1)
    bar.finish()
    with create_connection() as conn:
        last_cert_id = (pd.read_sql(
            "SELECT * from web_certificates ORDER BY cert_id DESC LIMIT 1",
            conn).iloc[0].cert_id)
    df_web = pd.DataFrame(
        data={
            "pub_date": pub_date,
            "city": city,
            "address": address,
            "title": title,
            "owner": owner,
            "contractor": contractor,
            "engineer": engineer,
            "url_key": url_key,
            "cert_type": cert_type,
            "source": [source] * len(pub_date),
        })
    if not len(df_web):
        return False
    df_web = df_web.sort_values("pub_date", ascending=True)
    df_web["cert_id"] = [
        int(x) for x in range(last_cert_id + 1, last_cert_id + 1 + len(df_web))
    ]
    # make date into actual datetime object
    df_web["pub_date"] = df_web.pub_date.apply(lambda x: str(
        parse_date(str(x)).date()) if (x and str(x) != 'nan') else np.nan)
    logger.info("Fetching geocode information...")
    df_web = geocode(df_web)
    if test:
        return df_web
    attrs = [
        "cert_id",
        "pub_date",
        "city",
        "address_lat",
        "address_lng",
        "city_lat",
        "city_lng",
        "city_size",
        "address",
        "title",
        "owner",
        "contractor",
        "engineer",
        "url_key",
        "cert_type",
        "source",
    ]
    query = f""" 
        INSERT INTO web_certificates 
        ({', '.join(attrs)}) VALUES ({','.join(['%s']*len(attrs))})
    """
    new_certs = [[row[attr] for attr in attrs] for _, row in df_web.iterrows()]
    with create_connection() as conn:
        conn.cursor().executemany(query, new_certs)
        conn.commit()
    return True  # signaling that the scrape did return some results
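A hedged usage sketch based on the docstring above, assuming the module's imports and database configuration are already in place; with `test=True` the scraped certificates come back as a DataFrame instead of being written to the web_certificates table:

df = scrape(source="dcn", since="last_record", until="now", limit=10, test=True)
if df is not False:  # scrape() returns False when there was nothing new to collect
    print(df[["pub_date", "city", "title", "cert_type"]].head())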
Example #19
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Query the longitude and latitude for all departments and regions in Ivory Coast.
from geocoder import geocode

infile = open("cities_departmentID.txt", "r")
all_lines = infile.readlines()
infile.close()

place_geo = {}
for line in all_lines:
    query = line.split()[-1]
    if query != 'NA' and query not in place_geo.keys():
        geo = geocode(query, site='bing')
        lon = str(geo['longitude'])
        lat = str(geo['latitude'])
        place_geo[query] = lon + ',' + lat
        print query, geo

write_lines = ['department\tlongitude\tlatitude\n']
for place, geo in place_geo.items():
    lon, lat = geo.split(',')
    newline = place + '\t' + lon + '\t' + lat + '\n'
    write_lines.append(newline)

outfile = open('ivc_dept_geo1.txt', 'w')
outfile.writelines(write_lines)
outfile.close()
Example #20
from argparse import ArgumentParser
from geocoder import geocode
import json


if __name__ == '__main__':
    parser = ArgumentParser(description='Use Mapzen to geocode a location')

    parser.add_argument('location', type=str,
                           help='A human-readable description of your location/address')

    parser.add_argument('api_key', type=str, help='Your Mapzen API key')

    args = parser.parse_args()

    mapzen_result = geocode(api_key=args.api_key, location_name=args.location)

    if not mapzen_result:
        print("Sorry, could not geocode the location:", args.location)
    else:
        # print dictionary as a prettified JSON
        txt = json.dumps(mapzen_result, indent=2)
        print(txt)
Example #21
    map_file = "map.png"
    with open(map_file, "wb") as file:
        file.write(response.content)

    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False
        if event.type == pygame.KEYDOWN:
            delta = (18 - int(parms['z'])) // 2
            if active:
                if event.key == pygame.K_RETURN:
                    active = False

                    parms['ll'] = str(get_coordinates(text)[0]) + ',' + str(get_coordinates(text)[1])
                    parms['pt'] = parms['ll'] + ',flag'
                    print(geocode(text))
                    text = geocode(text)['metaDataProperty']['GeocoderMetaData']['Address']['formatted']
                    index = f(
                        f"http://geocode-maps.yandex.ru/1.x/?apikey=40d1649f-0493-4b70-98ba-98533de7710b&geocode={text}&format=json")
                    text += index
                elif event.key == pygame.K_BACKSPACE:
                    text = text[:-1]
                else:

                    text += event.unicode
            if event.key == pygame.K_PAGEUP and int(parms['z']) < 22:
                parms['z'] = str(int(parms['z']) + 1)
            if event.key == pygame.K_PAGEDOWN and 0 < int(parms['z']):
                parms['z'] = str(int(parms['z']) - 1)
            if event.mod & pygame.KMOD_SHIFT:  # event.key never equals the KMOD_SHIFT modifier mask
                if index not in text:
Example #22
import csv

SOURCE_FILENAME = './static/data/AdultResidentialFacilities06052016.csv'
OUTPUT_FILENAME = 'static/data/geocoded_facilities.csv'

# Open the old data
with open(SOURCE_FILENAME, 'r') as f:
    facilities = []
    for row in csv.DictReader(f):
        facilities.append(row)

xcount = 0
# now we geocode
for f in facilities:
    xcount += 1
    address = f['Facility Address']
    city = f['Facility City']
    coordinates = geocode(address, city)
    f['latitude'] = coordinates['latitude']
    f['longitude'] = coordinates['longitude']
    print(xcount, address, city, coordinates)

print("Geocoding all done!")

the_headers = list(facilities[0].keys())

with open(OUTPUT_FILENAME, 'w') as wfile:
    c = csv.DictWriter(wfile, fieldnames=the_headers)
    c.writeheader()
    c.writerows(facilities)