def parse_json_checkin(json, url=None):
    """Return (user id, venue id, local time) for a Foursquare checkin.

    `json` is either raw JSON text or an already-parsed dict holding the
    checkin under the 'checkin' key; `url` is only used when reporting a
    parse failure.  Returns None on empty, unparsable or incomplete input.
    """
    if not json:
        return None
    if isinstance(json, dict):
        checkin = json['checkin']
    else:
        try:
            checkin = ujson.loads(json)
        except (TypeError, ValueError) as not_json:
            print(not_json, json, url)
            return None
    uid = u.get_nested(checkin, ['user', 'id'])
    vid = u.get_nested(checkin, ['venue', 'id'])
    time = u.get_nested(checkin, 'createdAt')
    offset = u.get_nested(checkin, 'timeZoneOffset', 0)
    if None in [uid, vid, time]:
        return None
    time = datetime.fromtimestamp(time, tz=pytz.utc)
    # Shifting by the offset makes the datetime local wall-clock time, not
    # UTC.  We deliberately do not attach the real timezone: pymongo would
    # convert tz-aware values back to UTC on insertion, and we want local
    # time stored (and returned from the DB) without any conversion.
    time += timedelta(minutes=offset)
    return int(uid), str(vid), time
def parse_tweet(tweet):
    """Build a CheckIn from `tweet`, or return None when the tweet has no
    coordinates, falls outside a supported city, or carries no Foursquare
    link."""
    loc = u.get_nested(tweet, 'coordinates')
    if not loc:
        # Without coordinates we would need to resolve the checkin link to
        # locate the tweet, which is too costly, so we drop it (introducing
        # a bias toward users who openly share their location).
        return None
    lon, lat = loc['coordinates']
    city = find_town(lat, lon, CITIES_TREE)
    if not city or city not in cities.SHORT_KEY:
        return None
    tid = u.get_nested(tweet, 'id_str')
    urls = u.get_nested(tweet, ['entities', 'urls'], [])
    # The short checkin URL must later be expanded (bitly API or
    # VenueIdCrawler) and then resolved against the Foursquare API
    # (500 requests per hour) to obtain the full info.
    def from_foursquare(link):
        return '4sq.com' in link or 'swarmapp.com' in link
    fsq_urls = [entry['expanded_url'] for entry in urls
                if from_foursquare(entry['expanded_url'])]
    if not fsq_urls:
        return None
    lid = str(fsq_urls[0])
    uid = u.get_nested(tweet, ['user', 'id_str'])
    msg = u.get_nested(tweet, 'text')
    try:
        time = datetime.strptime(tweet['created_at'], UTC_DATE)
        time = cities.utc_to_local(city, time)
    except ValueError:
        print('time: {}'.format(tweet['created_at']))
        return None
    return FullCheckIn('', lid, '', city, loc, time, tid, uid, msg)
def parse_json_checkin(json, url=None):
    """Extract (uid, vid, local datetime) from a Foursquare checkin.

    `json` may be JSON text or a dict already carrying the payload under
    the 'checkin' key; `url` only appears in the failure report.
    Returns None when the input is empty, unparsable or lacks a field.
    """
    if not json:
        return None
    if not isinstance(json, dict):
        try:
            checkin = ujson.loads(json)
        except (TypeError, ValueError) as not_json:
            print(not_json, json, url)
            return None
    else:
        checkin = json['checkin']
    fields = [u.get_nested(checkin, ['user', 'id']),
              u.get_nested(checkin, ['venue', 'id']),
              u.get_nested(checkin, 'createdAt')]
    if None in fields:
        return None
    uid, vid, timestamp = fields
    offset = u.get_nested(checkin, 'timeZoneOffset', 0)
    time = datetime.fromtimestamp(timestamp, tz=pytz.utc)
    # Adding the offset turns this into local wall-clock time rather than
    # UTC.  The real timezone is not attached on purpose: pymongo converts
    # tz-aware datetimes back to UTC at insertion, and we want the stored
    # local time to come back from the DB unconverted.
    time += timedelta(minutes=offset)
    return int(uid), str(vid), time
def job() -> None:
    """Fetch Yahoo statement payloads and load every parsed statement row."""
    data: List[Base] = []
    responses: List[Tuple] = fetch_yahoo_responses()
    # One (model, outer payload key, inner payload key) entry per statement
    # type, processed in this order for every response.
    statement_specs = (
        (IncomeStatement, 'incomeStatementHistory', 'incomeStatementHistory'),
        (CashFlowStatement, 'cashflowStatementHistory', 'cashflowStatements'),
        (BalanceSheetStatement, 'balanceSheetHistory',
         'balanceSheetStatements'),
    )
    for payload, isin in responses:
        for model, outer_key, inner_key in statement_specs:
            statements: List = get_nested(
                payload, outer_key, inner_key, default=[])
            data.extend(
                traverse_statement_history(
                    Model=model,  # type: ignore
                    isin=isin,
                    statements=statements))
    ETLBase.load_data(data)
async def task(i):
    """Consume aweme jobs from the 'tiktok:aweme' Redis list until STOP_FLAG.

    Each popped item has the form "<key>$<json>"; the raw pair is appended to
    the meta file, then the video is downloaded from the first working URL
    and uploaded.

    Parameters:
        i: worker index, used only to name the per-worker aweme_meta file.
    """
    # BUG FIX: the handle was previously opened and never closed (resource
    # leak).  It is still never written to (the meta line goes through
    # get_meta_file instead); kept open here only to preserve the side
    # effect of creating aweme_meta/{i}.txt.
    with open(f"aweme_meta/{i}.txt", "a"):
        while not STOP_FLAG:
            item = redis_client.lpop("tiktok:aweme")
            if item is None:
                # Queue drained: back off before polling again.
                await asyncio.sleep(5)
                continue
            [key, aweme] = item.decode("utf-8").split("$", 1)
            get_meta_file(meta_file_dir).write(f"{key}\t{aweme}\n")
            aweme = json.loads(aweme)
            aweme_id = aweme.get("aweme_id")
            if not aweme_id:
                # BUG FIX: a missing id previously raised
                # TypeError (None + ".mp4"); skip the item instead.
                logger.warning("item without aweme_id skipped: %s", key)
                continue
            file_name = aweme_id + ".mp4"
            download_urls = get_nested(
                aweme, ["video", "download_addr", "url_list"])
            if not download_urls:
                continue
            try:
                downloaded = False
                for url in download_urls:
                    # Try each candidate URL until one download succeeds.
                    try:
                        await download_video(url, file_name)
                        downloaded = True
                        break
                    except Exception as e:
                        logger.warning(e)
                if not downloaded:
                    # BUG FIX: previously the upload was attempted even when
                    # every download failed and no local file existed.
                    continue
                await upload_video(file_name)
            except Exception as e:
                logger.warning(e)
def leaf_criterium(keys, node):
    """Decide whether `node` should be treated as a leaf.

    A dict node is NOT a leaf unless the parser registered at `keys` in
    `parser_tree` is itself a "dict" parser; everything else is a leaf.
    """
    try:
        expected = get_nested(parser_tree, keys).type
    except (AttributeError, KeyError):
        # No parser (or no .type) registered for this path.
        expected = None
    return not (isinstance(node, dict) and expected != "dict")
def push_images(self):
    """Push images to registries.

    For each configured repository: push every package README first, then
    (unless --readmes short-circuits) upload each built image tag to the
    registry whose key matches the image name.
    """
    if not self._push_context:
        log.debug(
            "--generate images is not specified. Generate push context...")
        # Rebuild the tag -> local docker image mapping on demand.
        for image_tag, _ in self.build_tags():
            self._push_context[image_tag] = docker_client.images.get(
                image_tag)
    for registry, registry_spec in self.repository.items():
        # We will want to push README first.
        # Credentials come from the env vars named in the registry spec.
        login_payload: Dict = {
            "username": os.getenv(registry_spec['user']),
            "password": os.getenv(registry_spec['pwd']),
        }
        api_url: str = get_nested(registry_spec, ['urls', 'api'])
        for package, registry_url in registry_spec['registry'].items():
            _, _url = registry_url.split('/', maxsplit=1)
            readme_path = Path('docs', package, 'README.md')
            repo_url: str = f"{get_nested(registry_spec, ['urls', 'repos'])}/{_url}/"
            self.push_readmes(api_url, repo_url, readme_path, login_payload)
        if FLAGS.readmes:
            log.info("--readmes is specified. Exit after pushing readmes.")
            return
        # Then push image to registry.
        for image_tag in self.push_tags():
            image = self._push_context[image_tag]
            reg, tag = image_tag.split(":")
            # NOTE(review): this rebinds the outer loop variable `registry`
            # to the matching registry URL string; harmless because the for
            # loop rebinds it on the next iteration, but worth renaming.
            registry = ''.join([
                v for k, v in registry_spec['registry'].items()
                if reg in k
            ])
            log.info(f"Uploading {image_tag} to {registry}")
            # NOTES: about concurrent pushing
            # This would change most of our build logics
            # since DockerClient is essentially a requests.Session,
            # which doesn't have support for asynchronous requests.
            # If we want to implement aiohttp then we might want to
            # run docker from shell commands.
            self.background_upload(image, tag, registry)
            # separate release latest tags for yatai-service
            if all(
                    map(image_tag.__contains__,
                        ['yatai-service', '3.8', 'slim'])):
                log.info(f"Uploading {image_tag} as latest to {registry}")
                tag = 'latest'
                self.background_upload(image, tag, registry)
def parse_tweet(tweet):
    """Return a FullCheckIn for `tweet`, or None when the tweet cannot be
    placed in a supported city or has no Foursquare link attached."""
    loc = u.get_nested(tweet, 'coordinates')
    if not loc:
        # Locating the tweet would require following the checkin link,
        # which is too expensive, so such tweets are dropped (biasing the
        # sample toward users who share coordinates openly).
        return None
    lon, lat = loc['coordinates']
    city = find_town(lat, lon, CITIES_TREE)
    if not (city and city in cities.SHORT_KEY):
        return None
    tid = u.get_nested(tweet, 'id_str')
    urls = u.get_nested(tweet, ['entities', 'urls'], [])
    # The short checkin URL still needs expansion (bitly API or
    # VenueIdCrawler) followed by a Foursquare request (500 per hour)
    # to obtain the venue details.
    fsq_urls = []
    for entry in urls:
        expanded = entry['expanded_url']
        if '4sq.com' in expanded or 'swarmapp.com' in expanded:
            fsq_urls.append(expanded)
    if not fsq_urls:
        return None
    lid = str(fsq_urls[0])
    uid = u.get_nested(tweet, ['user', 'id_str'])
    msg = u.get_nested(tweet, 'text')
    try:
        time = datetime.strptime(tweet['created_at'], UTC_DATE)
        time = cities.utc_to_local(city, time)
    except ValueError:
        print('time: {}'.format(tweet['created_at']))
        return None
    return FullCheckIn('', lid, '', city, loc, time, tid, uid, msg)
def push_images(self) -> None:
    """Push images (and package READMEs) to every configured registry.

    For each repository: push READMEs first, then (unless --readmes
    short-circuits) upload each non-base image tag concurrently through a
    small thread pool.
    """
    if not self._push_context:
        log.debug(
            "--generate images is not specified. Generate push context...")
        for image_tag, _ in self.build_tags(
        )[1]:  # get non base image tags
            self._push_context[image_tag] = docker_client.images.get(
                image_tag)
    for registry, registry_spec in self.repository.items():
        # We will want to push README first.
        # Credentials come from the env vars named in the registry spec.
        login_payload: t.Dict = {
            "username": os.getenv(registry_spec["user"]),
            "password": os.getenv(registry_spec["pwd"]),
        }
        api_url: str = get_nested(registry_spec, ["urls", "api"])
        for package, registry_url in registry_spec["registry"].items():
            _, _url = registry_url.split("/", maxsplit=1)
            readme_path = Path("generated", package, "README.md")
            repo_url: str = (
                f"{get_nested(registry_spec, ['urls', 'repos'])}/{_url}/")
            self.push_readmes(api_url, repo_url, readme_path, login_payload)
        if FLAGS.readmes:
            log.info("--readmes is specified. Exit after pushing readmes.")
            return
        # Then push image to registry.
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            for image_tag in self.push_tags():
                image = self._push_context[image_tag]
                reg, tag = image_tag.split(":")
                # Renamed from `registry` to avoid shadowing the outer
                # loop variable.
                target = "".join([
                    v for k, v in registry_spec["registry"].items()
                    if reg in k
                ])
                log.info(f"Uploading {image_tag} to {target}")
                futures.append(
                    executor.submit(self.background_upload, image, tag,
                                    target))
            # BUG FIX: the original did `log.info(future.result)`, which
            # logs the bound method object without calling it, so upload
            # exceptions were silently dropped.  Calling result() here
            # surfaces any failure once all uploads are queued.
            for future in futures:
                future.result()
def get_count(obj, field):
    """Return how many items of type `field` are in `obj`, 0 if unknown."""
    path = [field, 'count']
    return get_nested(obj, path, 0)
def get_loc(vid):
    """Return the coordinates of venue `vid`, or None if it's not in DB."""
    record = DB.venue.find_one({'_id': vid}, {'loc': 1})
    if not record:
        return None
    return u.get_nested(record, ['loc', 'coordinates'])
def process_response(cls, response: Dict, isin: str) -> Base:
    """Map one raw Yahoo income-statement entry onto a `cls` row for `isin`."""
    # snake_case column -> camelCase key in the Yahoo payload; every value
    # is read from the nested 'raw' field.
    field_map = {
        'total_revenue': 'totalRevenue',
        'cost_of_revenue': 'costOfRevenue',
        'gross_profit': 'grossProfit',
        'research_development': 'researchDevelopment',
        'selling_general_administrative': 'sellingGeneralAdministrative',
        'non_recurring': 'nonRecurring',
        'other_operating_expenses': 'otherOperatingExpenses',
        'total_operating_expenses': 'totalOperatingExpenses',
        'operating_income': 'operatingIncome',
        'total_other_income_expense_net': 'totalOtherIncomeExpenseNet',
        'ebit': 'ebit',
        'interest_expense': 'interestExpense',
        'income_before_tax': 'incomeBeforeTax',
        'income_tax_expense': 'incomeTaxExpense',
        'minority_interest': 'minorityInterest',
        'net_income_from_continuing_ops': 'netIncomeFromContinuingOps',
        'discontinued_operations': 'discontinuedOperations',
        'extraordinary_items': 'extraordinaryItems',
        'effect_of_accounting_charges': 'effectOfAccountingCharges',
        'other_items': 'otherItems',
        'net_income': 'netIncome',
        'net_income_applicable_to_common_shares':
            'netIncomeApplicableToCommonShares',
    }
    record = {column: get_nested(response, key, 'raw')
              for column, key in field_map.items()}
    record['isin'] = isin
    record['report_date'] = datetime.fromtimestamp(
        get_nested(response, 'endDate', 'raw')).date()
    result: Base = cls(**record)
    return result
def get_count(obj, field):
    """Count of `field` entries recorded in `obj`; 0 when missing."""
    count = get_nested(obj, [field, "count"], 0)
    return count
def process_response(cls, response: Dict, isin: str) -> Base:
    """Map one raw Yahoo balance-sheet entry onto a `cls` row for `isin`."""
    # snake_case column -> camelCase key in the Yahoo payload; every value
    # is read from the nested 'raw' field.
    field_map = {
        'cash': 'cash',
        'short_term_investments': 'shortTermInvestments',
        'net_receivables': 'netReceivables',
        'total_current_assets': 'totalCurrentAssets',
        'property_plant_equipment': 'propertyPlantEquipment',
        'intangible_assets': 'intangibleAssets',
        'other_assets': 'otherAssets',
        'deferred_long_term_asset_charges': 'deferredLongTermAssetCharges',
        'total_assets': 'totalAssets',
        'accounts_payable': 'accountsPayable',
        'short_long_term_debt': 'shortLongTermDebt',
        'other_current_liab': 'otherCurrentLiab',
        'long_term_debt': 'longTermDebt',
        'other_liab': 'otherLiab',
        'deferred_long_term_liab': 'deferredLongTermLiab',
        'total_current_liabilities': 'totalCurrentLiabilities',
        'total_liab': 'totalLiab',
        'common_stock': 'commonStock',
        'retained_earnings': 'retainedEarnings',
        'treasury_stock': 'treasuryStock',
        'other_stockholder_equity': 'otherStockholderEquity',
        'total_stockholder_equity': 'totalStockholderEquity',
        'net_tangible_assets': 'netTangibleAssets',
    }
    record = {column: get_nested(response, key, 'raw')
              for column, key in field_map.items()}
    record['isin'] = isin
    record['report_date'] = datetime.fromtimestamp(
        get_nested(response, 'endDate', 'raw')).date()
    result: Base = cls(**record)
    return result
def process_response(cls, response: Dict, isin: str) -> Base:
    """Map one raw Yahoo cash-flow entry onto a `cls` row for `isin`."""
    # snake_case column -> camelCase key in the Yahoo payload; every value
    # is read from the nested 'raw' field.
    field_map = {
        'net_income': 'netIncome',
        'change_to_netincome': 'changeToNetincome',
        'change_to_account_receivables': 'changeToAccountReceivables',
        'change_to_liabilities': 'changeToLiabilities',
        'total_cash_from_operating_activities':
            'totalCashFromOperatingActivities',
        'capital_expenditures': 'capitalExpenditures',
        'other_cashflows_from_investing_activities':
            'otherCashflowsFromInvestingActivities',
        'total_cashflows_from_investing_activities':
            'totalCashflowsFromInvestingActivities',
        'dividends_paid': 'dividendsPaid',
        'net_borrowings': 'netBorrowings',
        'other_cashflows_from_financing_activities':
            'otherCashflowsFromFinancingActivities',
        'total_cash_from_financing_activities':
            'totalCashFromFinancingActivities',
        'effect_of_exchange_rate': 'effectOfExchangeRate',
        'change_in_cash': 'changeInCash',
        'repurchase_of_stock': 'repurchaseOfStock',
        'issuance_of_stock': 'issuanceOfStock',
    }
    record = {column: get_nested(response, key, 'raw')
              for column, key in field_map.items()}
    record['isin'] = isin
    record['report_date'] = datetime.fromtimestamp(
        get_nested(response, 'endDate', 'raw')).date()
    result: Base = cls(**record)
    return result