def get_pmh_input_record(self, first, last):
    args = {}
    args['metadataPrefix'] = 'oai_dc'
    pmh_records = []
    error = None
    my_sickle = self.get_my_sickle(self.pmh_url)
    logger.info(u"connected to sickle with {}".format(self.pmh_url))

    args['from'] = first.isoformat()[0:10]
    if last:
        args["until"] = last.isoformat()[0:10]
    if self.pmh_set:
        args["set"] = self.pmh_set

    logger.info(u"calling ListRecords with {} {}".format(self.pmh_url, args))
    try:
        pmh_records = my_sickle.ListRecords(ignore_deleted=True, **args)
        # logger.info(u"got pmh_records with {} {}".format(self.pmh_url, args))
        pmh_input_record = self.safe_get_next_record(pmh_records)
    except NoRecordsMatch as e:
        logger.info(u"no records with {} {}".format(self.pmh_url, args))
        pmh_input_record = None
    except Exception as e:
        logger.exception(u"error with {} {}".format(self.pmh_url, args))
        pmh_input_record = None
        error = "error in get_pmh_input_record: {} {} calling {}".format(
            e.__class__.__name__,
            unicode(e.message).encode("utf-8"),
            my_sickle.get_http_response_url())
        print error

    return (pmh_input_record, pmh_records, error)

def save_vote():
    # Get the team and the time the vote was cast.
    team = request.form["team"]
    time_cast = datetime.datetime.utcnow()
    # Verify that the team is one of the allowed options.
    if team != "TABS" and team != "SPACES":
        logger.warning(team)
        return Response(response="Invalid team specified.", status=400)

    stmt = sqlalchemy.text(
        "INSERT INTO votes (time_cast, candidate)"
        " VALUES (:time_cast, :candidate)"
    )
    try:
        with db.connect() as conn:
            conn.execute(stmt, time_cast=time_cast, candidate=team)
    except Exception as e:
        logger.exception(e)
        return Response(
            status=500,
            response="Unable to successfully cast vote! Please check the "
                     "application logs for more details.",
        )

    return Response(
        status=200,
        response="Vote successfully cast for '{}' at time {}!".format(team, time_cast),
    )

async def crawl(crawl_requests):
    async for crawl_request in crawl_requests:
        print(f'Receiving Request: {crawl_request.url}')
        visit_id = (uuid.uuid4().int & (1 << 53) - 1) - 2**52
        driver, logs = await app.loop.run_in_executor(
            thread_pool,
            partial(
                get_driver,
                visit_id=visit_id,
                crawl_id=crawl_request.crawl_id,
                ws_port=WS_PORT,
            ))
        for log in logs:
            logger.info(log)
        success, failure_type, message, exceptions = await app.loop.run_in_executor(
            thread_pool, partial(do_crawl, driver=driver, url=crawl_request.url))
        print(f'Finishing Request: {crawl_request.url}')
        for e in exceptions:
            logger.exception(e)
        result = CrawlResult(
            request_id=crawl_request.request_id,
            visit_id=visit_id,
            url=crawl_request.url,
            success=success,
            time_stamp=str(datetime.datetime.now(pytz.utc)),
            failure_type=failure_type,
            message=message,
        )
        await crawl_result_topic.send(value=result)

def _handle_dialog_remove_started(request):
    """
    Check if the provided ticker is valid and the stock is in the watchlist;
    if so, ask for confirmation. Otherwise, inform the user about the state.
    :type request: AlexaRequest
    """
    logger.debug("dialogState STARTED")
    user_id = request.user_id()

    # Check if a ticker is provided
    try:
        ticker = _check_valid_ticker_provided(request)
    except AttributeError:
        logger.exception("No valid ticker provided")
        message = strings.INTENT_REMOVE_FROM_WATCHLIST_FAIL
        return ResponseBuilder.create_response(request, message=message) \
            .with_reprompt(strings.INTENT_GENERAL_REPROMPT)

    # Check if the stock is in the user's watchlist
    is_in_watchlist = Watchlist.ticker_in_watchlist_exists(user_id, ticker)

    # Ask the user to confirm removal, or inform them the stock is not there
    if is_in_watchlist:
        logger.debug(
            f"Ask confirmation: remove stock {ticker} from user:{user_id} watchlist"
        )
        message = strings.INTENT_REMOVE_FROM_WATCHLIST_ASK_CONFIRMATION \
            .format(ticker)
        return ResponseBuilder.create_response(request, message) \
            .with_dialog_confirm_intent()
    else:
        logger.debug(
            f"Trying to remove stock {ticker}, which is not in watchlist")
        message = strings.INTENT_REMOVE_FROM_WATCHLIST_NOT_THERE.format(ticker)
        return ResponseBuilder.create_response(request, message)

def run(self):
    while 1:
        try:
            if app.config['IS_QUIT']:
                break
            p, request, que, imgpath = app.config['RECGQUE'].get(timeout=1)
        except Queue.Empty:
            pass
        except Exception as e:
            logger.error(e)
            time.sleep(1)
        else:
            try:
                carinfo = self.cre.imgrecg(imgpath, request['coord'])
                if carinfo is None:
                    result = None
                    logger.error('Recognise Error')
                elif carinfo['head']['code'] == 0:
                    result = None
                else:
                    result = carinfo['body']
            except Exception as e:
                logger.exception(e)
                result = None
            try:
                que.put(result)
            except Exception as e:
                logger.error(e)

def reset(token):
    try:
        user = User.query.filter_by(token=token).first()
    except Exception as err:
        logger.exception(err)
        flash("can not get user by token = {} from database".format(token),
              "negative")
        return redirect(url_for("userfe.reset", token=token))

    if not user:
        abort(404)

    form = FormReset()
    if form.validate_on_submit():
        if user:
            try:
                user.password = form.password.data.strip()
                user.token = None
                db.session.commit()
            except Exception as err:
                logger.exception(err)
                flash("can not reset password at this moment", "negative")
                return redirect(url_for("userfe.reset", token=token))
            flash("Your password has been reset, you can log in.", "positive")
            return redirect(url_for("userfe.login"))

    return render_template("frontend/user/reset.html", form=form, token=token)

def post(self):
    post_data = request.get_json()
    name = post_data.get("firstName")
    surname = post_data.get("lastName")
    email = post_data.get("email")
    password = post_data.get("password")

    user = get_user_by_email(email)
    if user:
        auth_namespace.abort(400, "Sorry. That email already exists.")
    user = User.add_user(name, surname, email, password)

    # msg = Message("Testing email", recipients=[email])
    # mail.send(msg)
    try:
        with open('./templates/registration_email.html') as file:
            template = file.read()
        guard.send_registration_email(email, user=user,
                                      confirmation_sender='SchabCoin',
                                      template=template)
    except Exception as e:
        logger.exception(e)
        return "Failed to send registration email", 500

    ret = {
        'message': f'successfully sent registration email to user {email}'
    }
    return ret, 200

def update_sector_analysis(sector_names):
    if not sector_names:
        return {}, [], [], []
    try:
        sector_dict = {}
        for s in sector_names:
            sector_data = get_sector_data(s)
            for ticker in sector_data:
                sector_data[ticker]['advanced-stats']['sector'] = s
            sector_dict.update(sector_data)
        sector_df = pd.DataFrame.from_dict(
            {s: sector_dict[s]['advanced-stats'] for s in sector_dict},
            orient='index')
        xfilter_options = [{
            'label': i,
            'value': i
        } for i in list(sector_df.columns) + [
            'EBITDAToEV(%)', 'EBITDAToRevenueMargin', 'TotalAssets',
            'EBITDAToAssets(%)'
        ]]
        company_options = [{
            'label': c,
            'value': c
        } for c in list(sector_df.companyName)]
        return sector_dict, xfilter_options, xfilter_options, company_options
    except Exception as e:
        logger.exception(e)
        return {}, [], [], []

def set_identify_and_initial_query(self):
    if not self.pmh_url:
        self.harvest_identify_response = u"error, no pmh_url given"
        return

    my_sickle = None  # initialize so the except block can safely test it
    try:
        # set timeout quick... if it can't do this quickly, won't be good for harvesting
        logger.debug(u"getting my_sickle for {}".format(self))
        my_sickle = self.get_my_sickle(self.pmh_url, timeout=10)
        data = my_sickle.Identify()
        self.harvest_identify_response = "SUCCESS!"
    except Exception as e:
        logger.exception(u"in set_identify_and_initial_query")
        self.error = u"error in calling identify: {} {}".format(
            e.__class__.__name__, unicode(e.message).encode("utf-8"))
        if my_sickle:
            self.error += u" calling {}".format(my_sickle.get_http_response_url())
        self.harvest_identify_response = self.error

    last = datetime.datetime.utcnow()
    first = last - datetime.timedelta(days=30)
    self.sample_pmh_record = None
    (pmh_input_record, pmh_records, error) = self.get_pmh_input_record(first, last)
    if error:
        self.harvest_test_recent_dates = error
    elif pmh_input_record:
        self.harvest_test_recent_dates = "SUCCESS!"
        self.sample_pmh_record = json.dumps(pmh_input_record.metadata)
    else:
        self.harvest_test_recent_dates = "error, no pmh_input_records returned"

def update_graph(df_dict, column_name, ticker):
    if not df_dict:
        return {}
    try:
        df_str_format = pd.DataFrame.from_dict(
            df_dict[ticker]['fin_report_dict'])
        df = pd.concat([
            df_str_format.iloc[:, 0],
            df_str_format.iloc[:, 1:].applymap(get_number_from_string)
        ], axis=1)
        for col in list(df.columns):
            if '%' in col:  # scale up ratio by 100 if unit is %
                df.loc[:, col] *= 100
        fig = px.line(df, x='index', y=column_name, line_shape='spline')
        fig.update_traces(mode='lines+markers')
        fig.update_layout(
            title=ticker + ": Past Performance is not a guarantee of Future Returns",
            xaxis_title="Year",
            yaxis_title="Value ($ or Ratio or %)",
            legend_title="Parameter(s)")
        return fig
    except Exception as e:
        logger.exception(e)
        return {}

def check_ticker_validity(ticker):
    try:
        if not ticker:
            raise ValueError(
                "Ticker Value is Empty, please Type Ticker, press Enter or Tab to continue analysis."
            )
        ticker_allcaps = ticker.upper()
        # Validate with https://sandbox.iexapis.com/stable/ref-data/symbols?token=
        if ticker_allcaps in ticker_dict():
            is_valid_ticker = True
            return (is_valid_ticker, not is_valid_ticker,
                    'Getting financial data... for: ' + ticker_dict()[ticker_allcaps],
                    [{
                        'status-info': 'Market Price used in Calculation: ',
                        'supp-data': ''
                    }])
        else:
            raise ValueError("Invalid Ticker entered: " + ticker +
                             '\nValid Tickers from listed Exchanges:\n' +
                             '\n'.join(exchange_list()))
    except Exception as InvalidTicker:
        # dbc.Alert(
        #     str(InvalidTicker),
        #     id="alert-invalid-ticker",
        #     dismissable=True,
        #     is_open=True,
        # )
        logger.exception(InvalidTicker)
        return False, True, '', handler_data_message(
            'See Error Message(s) below:', traceback.format_exc())

def grow_followers_worker(follow_bot, unfollow_bot):
    try:
        user_followings = unfollow_bot.API.user_following(
            unfollow_bot.API.authenticated_user_id,
            rank_token=unfollow_bot.API.generate_uuid()).get("users")
        followings = len(user_followings)
    except Exception as e:
        print(e)
        followings = 0

    print("Total following:", followings)
    # Prioritize unfollowing once the account follows more than 6000 users.
    if followings > 6000:
        bot1 = unfollow_bot
        bot2 = follow_bot
    else:
        bot1 = follow_bot
        bot2 = unfollow_bot

    while True:
        try:
            with app.app_context():
                bot1.start()
        except Exception:
            logger.exception("First bot failed to start")
        try:
            with app.app_context():
                bot2.start()
        except Exception:
            logger.exception("Second bot failed to start")

def rekapRegister(cid):
    register = common._useMysql("rekap_reg")
    siswas = api.all_santri()
    try:
        temps = []
        siswa = [i for i in siswas if i.get("nik") in (a[0] for a in register)]
        temps.append([
            "NIK", "NAMA LENGKAP", "BLOK", "KAMAR", "LEMBAGA", "KELAS",
            "JURUSAN"
        ])
        for i in siswa:
            data = (
                i.get("nik"),
                i.get("nama"),
                i.get("blok"),
                i.get("kamar"),
                i.get("lembaga") or "-",
                i.get("kelas") or "-",
                "{} {}".format(i.get("jurusan") or "-", i.get("rombel") or "-"),
            )
            temps.append(data)
        cetakExcel(cid, temps)
    except Exception as e:
        logger.exception(e)

def gets_a_word_doc(self, link, base_url):
    if is_purchase_link(link):
        return False

    absolute_url = get_link_target(link.href, base_url)
    if DEBUG_SCRAPING:
        logger.info(
            u"checking to see if {} is a word doc".format(absolute_url))

    start = time()
    try:
        r = http_get(absolute_url,
                     stream=True,
                     publisher=self.publisher,
                     session_id=self.session_id,
                     ask_slowly=self.ask_slowly)
        if r.status_code != 200:
            return False
        if is_a_word_doc(r):
            return True
    except Exception as e:
        logger.exception(u'error in gets_a_word_doc: {}'.format(e))
    return False

def set_identify_and_initial_query(self):
    if not self.pmh_url:
        self.harvest_identify_response = u"error, no pmh_url given"
        return

    my_sickle = None
    try:
        # set timeout quick... if it can't do this quickly, won't be good for harvesting
        logger.debug(u"getting my_sickle for {}".format(self))
        my_sickle = _get_my_sickle(self.pmh_url, timeout=10)
        my_sickle.Identify()
        self.harvest_identify_response = "SUCCESS!"
    except Exception as e:
        logger.exception(u"in set_identify_and_initial_query")
        self.error = u"error in calling identify: {} {}".format(
            e.__class__.__name__, unicode(e.message).encode("utf-8"))
        if my_sickle:
            self.error += u" calling {}".format(my_sickle.get_http_response_url())
        self.harvest_identify_response = self.error

    self.sample_pmh_record = None
    try:
        sample_pmh_record = self.get_recent_pmh_record()
        if sample_pmh_record:
            self.harvest_test_recent_dates = "SUCCESS!"
            self.sample_pmh_record = json.dumps(sample_pmh_record.metadata)
        else:
            self.harvest_test_recent_dates = "error, no pmh_input_records returned"
    except Exception as e:
        self.error = u"error in get_recent_pmh_record: {} {}".format(
            e.__class__.__name__, unicode(e.message).encode("utf-8"))
        self.harvest_test_recent_dates = self.error

def translate_1_2():
    """
    :param sl: source language
    :type sl: string
    :param tl: target language
    :type tl: string
    :param m: mode (1 for normal, 2 for better)
    :type m: int
    :param t: text to be translated
    :type t: string

    Translates the given text.
    """
    keys = ('t', 'm', 'sl', 'tl')
    text, mode, source, target = map(lambda k: request.form[k].strip(), keys)
    try:
        payload = translate(text, mode, source, target)
        return jsonify(payload)
    except HTTPException as e:
        return e.message, e.status_code
    except Exception as e:
        logger.exception(e)
        return str(e), 500

def post(self):
    """Add a new offer."""
    logger.info("Offers.post() request_body: %s", str(request.get_json()))
    try:
        content = json.loads(request.form['data'])
        user_id = current_user().id
        photo = request.files.get('photo', None)
        photo_url = cloudinary_uploader.upload(photo)['url'] if photo else None
        for parameter in [
                'name', 'portions_number', 'longitude', 'latitude',
                'pickup_times', 'offer_expiry'
        ]:
            if parameter not in content:
                return f"{parameter} missing in request", 400
        offer_id = Offer.add_offer(
            user_id, content['name'], True, content['portions_number'], 0,
            content['longitude'], content['latitude'], datetime.now(),
            content['pickup_times'], content['offer_expiry'],
            content.get('description', None), photo_url)
        for tag_id in content.get('tags', []):
            OffersTags.add_offer_tag(offer_id, tag_id)
        return "Offer has been added", 201
    except Exception as e:
        logger.exception("Offers.post(): %s", str(e))
        return "Couldn't add offers", 500

def _handle_dialog_add_started(request):
    """
    Check whether the provided ticker is supported and whether it is already
    in the watchlist, then ask the user to confirm the add.
    :type request: AlexaRequest
    """
    logger.debug("dialogState STARTED")

    # Check if a ticker is provided
    try:
        ticker = _check_valid_ticker_provided(request)
    except AttributeError:
        logger.exception("No valid ticker provided")
        message = strings.INTENT_ADDED_TO_WATCHLIST_FAIL
        return ResponseBuilder.create_response(request, message=message) \
            .with_reprompt(strings.INTENT_GENERAL_REPROMPT)

    # Ask the user to confirm the ticker add
    message = strings.INTENT_ADD_TO_WATCHLIST_ASK_CONFIRMATION.format(ticker)

    # Check whether the ticker is already in the watchlist
    user_id = request.get_user_id()
    watchlist_tickers = Watchlist.get_users_tickers(user_id)
    for ticker_in_watchlist in watchlist_tickers:
        if ticker == ticker_in_watchlist:
            message = strings.INTENT_ADDED_TO_WATCHLIST_EXISTS.format(ticker)

    return ResponseBuilder.create_response(request, message) \
        .with_dialog_confirm_intent()

def run(self):
    while 1:
        try:
            if app.config["IS_QUIT"]:
                break
            p, request, que, imgpath = app.config["RECGQUE"].get(timeout=1)
        except Queue.Empty:
            pass
        except Exception as e:
            logger.error(e)
            time.sleep(1)
        else:
            try:
                carinfo = self.cre.imgrecg(imgpath, request["coord"])
                if carinfo is None:
                    result = None
                    logger.error("Recognise Error")
                elif carinfo["head"]["code"] == 0:
                    result = None
                else:
                    result = carinfo["body"]
            except Exception as e:
                logger.exception(e)
                result = None
            try:
                que.put(result)
            except Exception as e:
                logger.error(e)

async def cookie2user(cookie_str):
    if not cookie_str:
        return None
    try:
        L = cookie_str.split('-')
        if len(L) != 3:
            # not the 3 fields we used when building the sha1 string, so return None
            return None
        uid, expires, sha1 = L  # unpack the user id, expiry time and sha1 digest
        if int(expires) < time.time():
            # expired (more than a day old), return None
            return None
        user = await User.find(uid)  # look up the user by id (the primary key)
        if user is None:
            return None
        # rebuild the verification sha1 string from the stored user data
        s = '%s-%s-%s-%s' % (uid, user.passwd, expires, _COOKIE_KEY)
        if sha1 != hashlib.sha1(s.encode('utf-8')).hexdigest():
            logger.info('invalid sha1')
            return None
        user.passwd = '*******'
        return user
    except Exception as e:
        logger.exception(e)
        return None

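# A minimal sketch of the cookie-writing counterpart to cookie2user above.
# The name user2cookie and the max_age parameter are assumptions; the
# "uid-expires-sha1(uid-passwd-expires-_COOKIE_KEY)" layout follows directly
# from the parsing and verification logic in cookie2user.
def user2cookie(user, max_age):
    # expiry as an absolute unix timestamp, matching the int(expires) check above
    expires = str(int(time.time() + max_age))
    s = '%s-%s-%s-%s' % (user.id, user.passwd, expires, _COOKIE_KEY)
    return '-'.join([user.id, expires, hashlib.sha1(s.encode('utf-8')).hexdigest()])
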
def set_r_for_pdf(self):
    self.r = None
    try:
        self.r = http_get(url=self.scraped_pdf_url,
                          stream=False,
                          publisher=self.publisher,
                          session_id=self.session_id,
                          ask_slowly=self.ask_slowly)
    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in set_r_for_pdf: {}".format(
            self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in set_r_for_pdf: {}".format(
            self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in set_r_for_pdf: {}".format(
            self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    # ChunkedEncodingError is a subclass of RequestException, so it must be
    # caught before the generic RequestException handler or it is unreachable.
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in set_r_for_pdf: {}".format(
            self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException in set_r_for_pdf"
        logger.info(self.error)
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in set_r_for_pdf: {}".format(
            self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except Exception as e:
        self.error += u"ERROR: Exception error in set_r_for_pdf"
        logger.exception(self.error)

def _insertAbsen(data):
    conn = MySQLdb.connect(host=HOST, user=USER, passwd=PASS, db=DB)
    logger.info(data)
    cur = None  # initialize so the except block can close it safely
    try:
        cur = conn.cursor()
        book = xlrd.open_workbook('{}'.format(data))
        sheet = book.sheet_by_index(0)
        jum = _useMysql("id_terakhir")[0]
        no = jum + 1
        sql = ('insert into absen (id, nik, ubudiyah, alquran, belajar, '
               'sekolah, diniyah, bulan) values (%s,%s,%s,%s,%s,%s,%s,%s)')
        for r in range(1, sheet.nrows):
            nik = sheet.cell(r, 0).value
            ubudiyah = sheet.cell(r, 1).value
            alquran = sheet.cell(r, 2).value
            belajar = sheet.cell(r, 3).value
            sekolah = sheet.cell(r, 4).value
            diniyah = sheet.cell(r, 5).value
            bulan = sheet.cell(r, 6).value
            text = (no, nik, ubudiyah, alquran, belajar, sekolah, diniyah, bulan)
            cur.execute(sql, text)
            no += 1
        conn.commit()
        cur.close()
        conn.close()
        os.remove('{}'.format(data))
        return "Sudah Di Insert Mbak Absennya, Senyum Dong :p"
    except Exception as e:
        conn.rollback()
        if cur:
            cur.close()
        conn.close()
        logger.exception(e)
        return e

def safe_get_next_record(self, current_record, tries=3):
    self.error = None
    try:
        next_record = next(current_record)
    except (requests.exceptions.HTTPError, requests.exceptions.SSLError) as e:
        if tries > 0:
            logger.info("requests exception! trying again {}".format(e))
            return self.safe_get_next_record(current_record, tries - 1)
        else:
            logger.info("requests exception! skipping {}".format(e))
            self.error = "requests error in safe_get_next_record; try again"
            return None
    except (KeyboardInterrupt, SystemExit):
        # done
        return None
    except StopIteration:
        logger.info("stop iteration! stopping")
        return None
    except NoRecordsMatch:
        logger.info("no records! stopping")
        return None
    except Exception as e:
        logger.exception("misc exception!: {} skipping".format(e))
        self.error = "error in safe_get_next_record"
        return None
    return next_record

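# A minimal usage sketch (an assumption, not code from the source repo):
# pulling records one at a time from a Sickle ListRecords iterator.
# safe_get_next_record retries transient HTTP errors and returns None at the
# end of the stream or on failure, with the error recorded on the harvester
# in the failure case. harvester and process() are hypothetical names.
pmh_records = my_sickle.ListRecords(ignore_deleted=True, metadataPrefix='oai_dc')
record = harvester.safe_get_next_record(pmh_records)
while record is not None:
    process(record)
    record = harvester.safe_get_next_record(pmh_records)
if harvester.error:
    logger.info(u"harvest stopped early: {}".format(harvester.error))
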
def apply(self):
    logger.info(f'Processing action ({self.__class__.__name__}) for ({self.isp.profile.email})...')
    driver = self.isp.driver
    profile = self.isp.profile
    # print('Start ActionChains...')
    # Go to the spam section.
    driver.get('https://mail.yahoo.com/d/folders/6')
    # Let javascript requests finish.
    time.sleep(5)
    # Scroll down.
    with utils.scroll_down(driver, 'div[data-test-id=virtual-list]',
                           ignored_exceptions=(JavascriptException,)):
        time.sleep(2)
    total_messages = self.isp.get_total_messages()
    if not isinstance(total_messages, int):
        # Set a default value or exit.
        total_messages = 0
    actions = ActionChains(driver)
    # Archive all messages.
    try:
        # Scroll to the top to open the first message.
        with utils.scroll_up(driver, 'div[data-test-id=virtual-list]',
                             ignored_exceptions=(JavascriptException,)):
            messages = driver.find_elements_by_css_selector('a[data-test-id=message-list-item]')
            messages[0].click()
        # Get the number of messages to open.
        last_message = common.get_amount_of_message(total_messages)
        click.secho(f'({profile.email}) Total messages {total_messages}: {last_message} messages will be opened.',
                    fg='bright_black')
        with click.progressbar(length=last_message,
                               label=f'Opening messages ({profile.email})...',
                               show_pos=True) as bar:
            for i in range(last_message):
                actions = ActionChains(driver)
                actions.send_keys(Keys.ARROW_RIGHT)
                # Add a star to the current message.
                if random.random() <= app_settings.MESSAGES_STARTS_RATIO:
                    actions.send_keys('l')
                actions.perform()
                # Show the progress.
                # print(f'\r{i+1}/{last_message}', end='')
                bar.update(1)  # +1 each time
                # Clearing all the chained actions is not working
                # (it's a bug in the selenium source code).
                # actions.reset_actions()
                time.sleep(random.uniform(3, 5))
    except TimeoutException:
        logger.warning(f'({self.ACTION.name}) {profile.email:.<40} [WARNING]')
    except Exception as e:
        logger.exception(f'[{self.ACTION.name}] {profile.email:.<40} [Error]')
    else:
        logger.info(f'({self.ACTION.name}) {profile.email:.<40} [DONE]')

def get_user_id(user_name):
    logger.debug("In getting user id")
    try:
        user = mm_client.users.get_user_by_username(user_name)
        return True, user["id"]
    except Exception:
        logger.exception("Exception in getting user id from mattermost server")
        return False, None

def download_image(link):
    try:
        res = requests.get(link)
        res.raise_for_status()
        return True, res.content
    except Exception:
        logger.exception("Exception in downloading image")
        return False, None

def do_get_responses():
    page = get_responses(invalid=False)
    try:
        return jsonify([item.to_dict() for item in page.items])
    except AttributeError:
        logger.exception("No items in page")
        return jsonify({}), 404

def save(self):
    try:
        db.session.merge(self)
        db.session.commit()
    except OperationalError:
        logger.exception(
            "Can't connect to MySQL server ottobotdb.clccaawfuuph.eu-central-1.rds.amazonaws.com"
        )

def safedispatch(self, environ, start_response):
    try:
        return self.appdispatch(environ, start_response)
    except:
        if self.debug:
            raise
        logger.exception("Exception")
        return Response("Fejlsidens fejlside.")(environ, start_response)

def create(cls, **kwargs):
    try:
        obj = cls(**kwargs)
        db.session.add(obj)
        db.session.commit()
        return obj
    except:
        logger.exception('Failed to create FakeMixn for %s with %s',
                         cls.__name__, kwargs)

def _call_biz(url, max_cnt):
    try:
        res = requests.get(url, params=dict(request.args, max_cnt=max_cnt))
        assert res.status_code == 200, "status_code is: {}, not 200!".format(res.status_code)
        return res.text
    except Exception as e:
        logger.exception(e)
        return 'error: {}'.format(e)

def feedback():
    try:
        with open(driver.feedback_path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(request.form, ensure_ascii=False) + '\n')
        return 'OK'
    except Exception as e:
        logger.exception(e)
        return 'ERROR'

def set_version_and_license(self, r=None):
    self.updated = datetime.datetime.utcnow().isoformat()

    if self.is_pmc:
        self.set_info_for_pmc_page()
        return

    # set as default
    self.scrape_version = "submittedVersion"

    is_updated = self.update_with_local_info()

    # now try to see what we can get out of the pdf itself
    if not r:
        logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(
            self.url, self.scrape_version, self.scrape_license))
        return

    try:
        # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
        if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
            self.scrape_version = "publishedVersion"

        text = convert_pdf_to_txt(r, max_pages=25)
        # logger.info(text)

        if text and self.scrape_version == "submittedVersion":
            patterns = [
                re.compile(ur"©.?\d{4}", re.UNICODE),
                re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication",
                           re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"all rights reserved", re.IGNORECASE),
                re.compile(ur"This article is distributed under the terms of the Creative Commons",
                           re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"This article is licensed under a Creative Commons",
                           re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"this is an open access article",
                           re.IGNORECASE | re.MULTILINE | re.DOTALL)
            ]
            for pattern in patterns:
                if pattern.findall(text):
                    logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                    self.scrape_version = "publishedVersion"

        if not self.scrape_license:
            open_license = find_normalized_license(text)
            if open_license:
                logger.info(u'found license in PDF: {}'.format(open_license))
                self.scrape_license = open_license
    except Exception as e:
        logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
        self.error += u"Exception doing convert_pdf_to_txt!"
        logger.info(self.error)

    logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(
        self.url, self.scrape_version, self.scrape_license))

def run_fetch():
    while True:
        if market.is_market_open():
            for symbol in symbols:
                symbol = symbol.upper()
                try:
                    fetch_save(symbol)
                except Exception as e:
                    logger.exception(e)
        sleep_to_next_minute()

def transmit_book_to_client(rkey=None):
    logger.warning('RKEY:::::::::::::::%s' % rkey)
    cumulative_book = rcon.get(rkey)
    logger.warning('CUMULATIVE BOOK %s' % cumulative_book)
    logger.warning('CUMULATIVE BOOK %s' % type(cumulative_book))
    try:
        buy_side, sell_side = json.loads(rcon.get(rkey))
        socketio.emit('orderbook update',
                      {'buy_side': buy_side, 'sell_side': sell_side},
                      namespace='/client')
        logger.debug('Sent orderbook volume to client')
    except (TypeError, ValueError):
        logger.exception('Failed to decode order book for key %s' % rkey)

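# A minimal sketch of the writer side implied by transmit_book_to_client
# above: the redis key must hold a JSON-encoded [buy_side, sell_side] pair
# for the json.loads unpacking to succeed. The rcon.set call and the sample
# book contents are assumptions, not code from the source.
rcon.set(rkey, json.dumps([
    {'100.50': 3.2},  # buy side: price -> volume
    {'100.75': 1.8},  # sell side: price -> volume
]))
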
def gets_a_pdf(self, link, base_url):
    if is_purchase_link(link):
        return False

    absolute_url = get_link_target(link.href, base_url)
    if DEBUG_SCRAPING:
        logger.info(u"checking to see if {} is a pdf".format(absolute_url))

    start = time()
    try:
        self.r = http_get(absolute_url,
                          stream=True,
                          publisher=self.publisher,
                          session_id=self.session_id,
                          ask_slowly=self.ask_slowly)

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # is unauthorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in gets_a_pdf".format(self.r.status_code, absolute_url)
            return False

        if self.is_a_pdf_page():
            return True

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error in gets_a_pdf"
        logger.info(self.error)
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
    except Exception as e:
        self.error += u"ERROR: Exception error in gets_a_pdf"
        logger.exception(self.error)

    if DEBUG_SCRAPING:
        logger.info(u"we've decided this ain't a PDF. took {} seconds [{}]".format(
            elapsed(start), absolute_url))
    return False

def queue_daemon(self, rv_ttl=500):
    """
    The daemon that listens for incoming orders. Must be run in a separate
    process. All received orders are stored in the database.
    """
    while True:
        logger.debug('Waiting for orders...')
        order_form_data = self.redis.blpop(prefixed(self.uuid))
        order_form_data = loads(order_form_data[1])
        new_order = Order(**order_form_data)
        self.store_order(new_order)
        try:
            response = self.process_order(new_order)
            logger.debug('Finished processing order.')
        except Exception as e:
            logger.exception(e)
            response = e

def do_get_response(tx_id):
    try:
        uuid.UUID(tx_id, version=4)
    except ValueError:
        raise InvalidUsageError("tx_id supplied is not a valid UUID", 400)

    result = get_responses(tx_id=tx_id)
    if result:
        try:
            result_dict = object_as_dict(result.items[0])['data']
            response = jsonify(result_dict)
            response.headers['Content-MD5'] = hashlib.md5(response.data).hexdigest()
            return response
        except IndexError:
            logger.exception('Empty items list in result.')
            return jsonify({}), 404
    else:
        return jsonify({}), 404

def safe_get_next_record(self, current_record):
    self.error = None
    try:
        next_record = current_record.next()
    except (requests.exceptions.HTTPError, requests.exceptions.SSLError):
        logger.info(u"requests exception! skipping")
        self.error = u"requests error in safe_get_next_record; try again"
        return None
    except (KeyboardInterrupt, SystemExit):
        # done
        return None
    except StopIteration:
        logger.info(u"stop iteration! stopping")
        return None
    except Exception as e:
        logger.exception(u"misc exception! skipping")
        self.error = u"error in safe_get_next_record"
        return None
    return next_record

def get_pmh_input_record(self, first, last, use_date_default_format=True):
    args = {}
    args['metadataPrefix'] = 'oai_dc'
    pmh_records = []
    self.error = None
    my_sickle = self.get_my_sickle(self.pmh_url)
    logger.info(u"connected to sickle with {}".format(self.pmh_url))

    args['from'] = first.isoformat()[0:10]
    if use_date_default_format:
        args['from'] += "T00:00:00Z"

    if last:
        args["until"] = last.isoformat()[0:10]
        if use_date_default_format:
            args['until'] += "T00:00:00Z"

    if self.pmh_set:
        args["set"] = self.pmh_set

    logger.info(u"calling ListRecords with {} {}".format(self.pmh_url, args))
    try:
        pmh_records = my_sickle.ListRecords(ignore_deleted=True, **args)
        # logger.info(u"got pmh_records with {} {}".format(self.pmh_url, args))
        pmh_input_record = self.safe_get_next_record(pmh_records)
    except NoRecordsMatch as e:
        logger.info(u"no records with {} {}".format(self.pmh_url, args))
        pmh_input_record = None
    except Exception as e:
        # some repositories reject the full datetime format; retry once with
        # the plain date before giving up
        if use_date_default_format:
            return self.get_pmh_input_record(first, last, use_date_default_format=False)
        logger.exception(u"error with {} {}".format(self.pmh_url, args))
        pmh_input_record = None
        self.error = u"error in get_pmh_input_record: {} {}".format(
            e.__class__.__name__, unicode(e.message).encode("utf-8"))
        if my_sickle:
            self.error += u" calling {}".format(my_sickle.get_http_response_url())

    return (pmh_input_record, pmh_records, self.error)

def get_data_file(filename):
    if logger:
        logger.debug("get_data_file Started.")

    results = {"status": {"http_code": 404}, "contents": {}}
    ret_code = 404
    try:
        with open(filename, "r") as data_file:
            # results['status']['http_code'] = 200
            # results['contents'] = simplejson.load(data_file)
            results = data_file.read()
            ret_code = 200
    except Exception as e:
        if logger:
            logger.exception(e)

    if logger:
        logger.debug("get_data_file Finished.")
    return results, ret_code

def appdispatch(self, environ, start_response):
    local.request = Request(environ)
    local.response = Response()
    local.session = Session(local.request.cookies.get("session"), 600)
    try:
        local.url_adapter = url_adapter = url_map.bind_to_environ(environ)
        try:
            endpoint, params = url_adapter.match()
        except NotFound:
            endpoint = "notfound"
            params = {}
        local.endpoint = endpoint
        endpoints[endpoint](**params)
    except:
        if self.debug:
            raise
        else:
            logger.exception("Exception")
            endpoints["error"]()
    response = local.response
    local.session.save()
    local.session.set_cookie(local.response)
    return response(environ, start_response)

def corpus_raw():
    """Collects raw corpus data."""
    raw, source_lang, target_lang = \
        map(lambda x: request.form[x], ('raw', 'sl', 'tl'))
    try:
        # See if 'raw' is a valid JavaScript string
        parsed = parse_javascript(raw)
        # Then insert it into the database
        CorpusRaw.insert(
            hash=hashlib.sha1(raw.encode('utf-8')).hexdigest(),
            raw=json.dumps(parsed),
            source_lang=source_lang,
            target_lang=target_lang,
        )
    except Exception as e:
        logger.exception(e)
        db.session.rollback()
        return str(e), 500
    return ''

def translate_v1_0():
    """
    :param sl: source language
    :type sl: string
    :param tl: target language
    :type tl: string
    :param m: mode (1 for normal, 2 for better)
    :type m: int
    :param t: text to be translated
    :type t: string

    Translates the given text.

    **Example Request**:

    .. sourcecode:: http

        POST /v1.0/translate HTTP/1.1
        User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.99 Safari/537.22
        Host: 192.168.0.185:5000
        Accept: */*
        Content-Length: 57
        Content-Type: application/x-www-form-urlencoded

        sl=ko&tl=en&m=2&t=여러분이 몰랐던 구글 번역기

    **Example Response**

    .. sourcecode:: http

        HTTP/1.0 200 OK
        Content-Type: application/json
        Content-Length: 90
        Server: Werkzeug/0.8.3 Python/2.7.3
        Date: Wed, 10 Apr 2013 06:43:13 GMT

        {
            "translated_text": "Google translation that you did not know",
            "serial_b62": "0z19x",
            "intermediate_text": "\u7686\u3055\u3093\u304c\u77e5\u3089\u306a\u304b\u3063\u305fGoogle\u306e\u7ffb\u8a33"
        }

    **Example iOS Code using ILHTTPClient**

    ILHTTPClient: https://github.com/isaaclimdc/ILHTTPClient

    .. sourcecode:: objective-c

        ILHTTPClient *client = [ILHTTPClient clientWithBaseURL:@"http://translator.suminb.com/" showingHUDInView:self.view];
        NSDictionary *params = @{
            @"sl": @"en",
            @"tl": @"ko",
            @"m": @"2",
            @"t": @"Google translation that you did not know."
        };
        [client postPath:@"/v1.0/translate"
              parameters:params
             loadingText:@"Loading..."
             successText:@"Success!"
           multiPartForm:^(id<AFMultipartFormData> formData) { }
                 success:^(AFHTTPRequestOperation *operation, NSString *response) {
                     NSLog(@"%@", response);
                 }
                 failure:^(AFHTTPRequestOperation *operation, NSError *error) { }
        ];
    """  # noqa
    keys = ('t', 'm', 'sl', 'tl')
    text, mode, source, target = map(lambda k: request.form[k].strip(), keys)
    try:
        return jsonify(translate(text, mode, source, target))
    except HTTPException as e:
        return e.message, e.status_code
    except Exception as e:
        logger.exception(e)
        return str(e), 500

def scrape_for_fulltext_link(self):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"europepmc.org",
        u"/europepmc/",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru"]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because it is on our do not scrape list.".format(url))
            return

    try:
        self.r = http_get(url,
                          stream=True,
                          publisher=self.publisher,
                          session_id=self.session_id,
                          ask_slowly=self.ask_slowly)

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # not authorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(self.r.status_code, url)
            return

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(url))
            self.scraped_pdf_url = url
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"is not a PDF for {}. continuing more checks".format(url))

        # now before reading the content, bail if it is too large
        if is_response_too_large(self.r):
            logger.info(u"landing page is too large, skipping")
            return

        # get the HTML tree
        page = self.r.content_small()

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = None
        # special exception for citeseer because we want the pdf link where
        # the copy is on the third party repo, not the cached link, if we can get it
        if url and u"citeseerx.ist.psu.edu/" in url:
            matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
            if matches:
                pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")
        # osf doesn't have their download link in their pages
        # so look at the page contents to see if it is osf-hosted
        # if so, compute the url. example: http://osf.io/tyhqm
        elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
            pdf_download_link = DuckLink(u"{}/download".format(url), "download")
        # otherwise look for it the normal way
        else:
            pdf_download_link = self.find_pdf_link(page)

        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url))

            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            # if they are linking to a PDF, we need to follow the link to make sure it's legit
            if DEBUG_SCRAPING:
                logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = url
                return

        # try this later because we would rather get a pdf.
        # if they are linking to a .docx or similar, this is open.
        doc_link = find_doc_download_link(page)
        if doc_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a .doc download link {} [{}]".format(
                    get_link_target(doc_link.href, self.r.url), url))
            self.scraped_open_metadata_url = url
            return

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self

def scrape_for_fulltext_link(self):
    landing_url = self.url

    if DEBUG_SCRAPING:
        logger.info(u"checking to see if {} says it is open".format(landing_url))

    start = time()
    try:
        self.r = http_get(landing_url,
                          stream=True,
                          publisher=self.publisher,
                          session_id=self.session_id,
                          ask_slowly=self.ask_slowly)
        resolved_landing_url = self.r.url

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # is unauthorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
            return

        # example 10.1007/978-3-642-01445-1
        if u"crossref.org/_deleted-doi/" in self.r.url:
            logger.info(u"this is a deleted doi")
            return

        # if our landing_url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(landing_url))
            self.scraped_pdf_url = landing_url
            self.open_version_source_string = "open (via free pdf)"
            # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"landing page is not a PDF for {}. continuing more checks".format(landing_url))

        # get the HTML tree
        page = self.r.content_small()

        # remove script tags
        try:
            soup = BeautifulSoup(page, 'html.parser')
            [script.extract() for script in soup('script')]
            page = str(soup)
        except HTMLParseError as e:
            logger.error(u'error parsing html, skipped script removal: {}'.format(e))

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = self.find_pdf_link(page)
        if pdf_download_link is not None:
            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = self.url
                self.open_version_source_string = "open (via free pdf)"

        # now look and see if it is not just free, but open!
        says_open_url_snippet_patterns = [
            ('projecteuclid.org/', u'<strong>Full-text: Open access</strong>'),
            ('sciencedirect.com/', u'<div class="OpenAccessLabel">open access</div>'),
            ('sciencedirect.com/', u'<div class="OpenAccessLabel">open archive</div>'),
        ]
        for (url_snippet, pattern) in says_open_url_snippet_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE)
            if url_snippet in resolved_landing_url.lower() and matches:
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"
                self.scraped_license = "implied-oa"

        says_open_access_patterns = [
            ("Informa UK Limited", u"/accessOA.png"),
            ("Oxford University Press (OUP)", u"<i class='icon-availability_open'"),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"isOpenAccess":true'),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"openAccessFlag":"yes"'),
            ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
            ("Cambridge University Press (CUP)", u'<span class="icon access open-access cursorDefault">'),
        ]
        for (publisher, pattern) in says_open_access_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE | re.DOTALL)
            if self.is_same_publisher(publisher) and matches:
                self.scraped_license = "implied-oa"
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"

        license_patterns = [
            ur"(creativecommons.org/licenses/[a-z\-]+)",
            u"distributed under the terms (.*) which permits",
            u"This is an open access article under the terms (.*) which permits",
            u"This is an open access article published under (.*) which permits",
            u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
        ]
        for pattern in license_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE)
            if matches:
                self.scraped_license = find_normalized_license(matches[0])
                self.scraped_open_metadata_url = self.url
                self.open_version_source_string = "open (via page says license)"

        if self.is_open:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this is open! took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return True
        else:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this doesn't say open. took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return False

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
        logger.info(self.error)
        return False
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return False

def get_chorus_data(starting_offset=0, agency_id=None):
    requests_session = requests.Session()
    retries = Retry(total=10,
                    backoff_factor=0.5,
                    status_forcelist=[500, 502, 503, 504])
    requests_session.mount('http://', DelayedAdapter(max_retries=retries))
    requests_session.mount('https://', DelayedAdapter(max_retries=retries))

    agencies = get_chorus_agencies()
    for agency in agencies:
        if agency_id:
            if int(agency["Agency_Id"]) != int(agency_id):
                print "skipping {}, you are not the agency id we are looking for".format(agency["Agency_Id"])
                continue
        if starting_offset:
            offset = starting_offset
        else:
            offset = 0

        logger.info(u"*** on agency {}:{}".format(agency["Agency_Name"], agency["Agency_Id"]))
        url_template = "https://api.chorusaccess.org/v1.1/agencies/{agency_id}/histories/current?category=publicly_accessible&limit={limit}&offset={offset}"
        limit = 50
        total_results = None
        while total_results is None or offset < total_results:
            loop_start = time()
            url = url_template.format(agency_id=agency["Agency_Id"], offset=offset, limit=limit)
            print url
            try:
                r = requests_session.get(url, timeout=360)  # wait for up to 6 minutes
            except Exception as e:
                logger.exception(u"Exception: {}, skipping".format(unicode(e.message).encode("utf-8")))
                r = None
            print u"api call elapsed: {} seconds".format(elapsed(loop_start, 1))
            offset += limit

            if r:
                data = r.json()
                total_results = data["total_results"]
                logger.info(u"Has {} total results, {} remaining".format(
                    total_results, total_results - offset))

                items = data["items"]
                new_objects = []
                for item in items:
                    if item["DOI"]:
                        doi = clean_doi(item["DOI"])
                        new_objects.append(Chorus(id=doi, raw=item))

                ids_already_in_db = [
                    id_tuple[0] for id_tuple in
                    db.session.query(Chorus.id).filter(
                        Chorus.id.in_([obj.id for obj in new_objects])).all()
                ]
                objects_to_add_to_db = [obj for obj in new_objects
                                        if obj.id not in ids_already_in_db]
                if objects_to_add_to_db:
                    logger.info(u"adding {} items".format(len(objects_to_add_to_db)))
                    db.session.add_all(objects_to_add_to_db)
                    safe_commit(db)
                else:
                    logger.info(u"all of these items already in db")

            logger.info(u"sleeping for 2 seconds")
            sleep(2)

        # (fragment: continues a loop over advisoryList inside a try block
        # opened earlier in the function)
            if tst_date_obj >= start_date_obj:
                resultList = advisoryList[ndx:]
                break
        else:
            # loop's else clause: no advisory was recent enough,
            # fall back to the most recent entry
            resultList = stationJson["properties"]["test"]["beachadvisories"][-1]

        properties = {}
        properties["desc"] = stationJson["properties"]["desc"]
        properties["station"] = stationJson["properties"]["station"]
        properties["test"] = {"beachadvisories": resultList}

        feature = geojson.Feature(id=station,
                                  geometry=stationJson["geometry"],
                                  properties=properties)
    except IOError as e:
        if logger:
            logger.exception(e)
    except ValueError as e:
        if logger:
            logger.exception(e)
    except Exception as e:
        if logger:
            logger.exception(e)

    try:
        if feature is None:
            feature = geojson.Feature(id=station)
        json_data = {"status": {"http_code": 202}, "contents": feature}
    except Exception as e:
        if logger:
            logger.exception(e)