import math
import os

from bs4 import BeautifulSoup

import constants as c  # project-local constants module (module name assumed)
import utils as u      # project-local helpers load_url/store_json (module name assumed)


def main():
    raw_output = []
    html = u.load_url(c.HOLIDAYS_URL)
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        cells = soup.find_all(['th', 'td'])
        # The table has four columns, so every four cells form one row.
        row_len = math.ceil(len(cells) / 4)
        for ctr in range(row_len):
            # Skip the header row.
            if ctr == 0:
                continue
            offset = ctr * 4
            holiday_type = cells[offset + 3].text.split()
            # Keep rows whose type is not on the ignore list; log the rest.
            if holiday_type[0] not in c.IGNORE_HOLIDAY_TYPES:
                raw_output_obj = {
                    'date': cells[offset + 0].text.strip(),
                    'day': cells[offset + 1].text.strip(),
                    'name': cells[offset + 2].text.strip(),
                    'type': holiday_type[0],
                }
                raw_output.append(raw_output_obj)
            else:
                print(holiday_type)
    output_fp = os.path.join(c.RESOURCE_DIRECTORY, c.RAW_OUTPUT_JSON)
    u.store_json(output_fp, raw_output)
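
# The `u.load_url` / `u.store_json` helpers above are project-local and not
# shown in this excerpt. A minimal sketch of plausible implementations,
# assuming they fetch a page body as text and pretty-print JSON to disk
# (names and behaviour here are assumptions, not the project's actual code):
import json

import requests


def load_url(url, timeout=30):
    """Return the response body for `url`, or None on any HTTP failure."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None


def store_json(file_path, data):
    """Serialize `data` to `file_path` as indented JSON."""
    with open(file_path, 'w', encoding='utf-8') as fp:
        json.dump(data, fp, indent=2, ensure_ascii=False)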
def process(self):
    while True:
        # Split the seed URLs into one chunk per worker thread.
        sub_sample_list = list_cut(self.web_seeds, self.threads_num)
        threads = []
        for i in range(self.threads_num):
            sub_samples = sub_sample_list[i]
            thread = threading.Thread(target=self.handle_task,
                                      args=(i, sub_samples))
            thread.start()
            threads.append(thread)
        for t in threads:
            t.join()
        log_content = "main thread of instagram ended; processed %d web seeds" % len(self.web_seeds)
        logging.info(log_content)
        if self.DEBUG:
            store_json(self.debug_json_file, self.test_json)
            print(self.debug_log_dict)
            break
        else:
            time.sleep(10 * 60)  # Crawl and check the hub pages once every 10 minutes.
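
# `list_cut` is referenced above but not defined in this excerpt. A minimal
# sketch, assuming it splits a list into `n` roughly equal contiguous sublists
# so each worker thread gets one chunk (the contract is inferred from the
# call site, not taken from the project's actual code):
def list_cut(items, n):
    """Split `items` into `n` contiguous sublists of near-equal size."""
    size, remainder = divmod(len(items), n)
    chunks = []
    start = 0
    for i in range(n):
        # The first `remainder` chunks absorb one extra item each.
        end = start + size + (1 if i < remainder else 0)
        chunks.append(items[start:end])
        start = end
    return chunks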
import datetime
import os

import constants as c  # project-local constants module (module name assumed)
import utils as u      # project-local helpers load_json/store_json (module name assumed)


def main():
    raw_json_fp = os.path.join(c.RESOURCE_DIRECTORY, c.RAW_OUTPUT_JSON)
    final_json_fp = os.path.join(c.RESOURCE_DIRECTORY, c.FINAL_OUTPUT_JSON)
    holiday_input = u.load_json(raw_json_fp)
    holiday_output = []
    for obj in holiday_input:
        if obj['day'] in c.ACCEPTABLE_DAYS:
            # '%b %d' parses dates like 'Jan 01'; the year defaults to 1900,
            # which is harmless since only the month and day are used below.
            dt = datetime.datetime.strptime(obj['date'], '%b %d')
            output_obj = {
                'type': obj['type'],
                'month': dt.strftime('%B'),
                'date': dt.day,
                'day': obj['day'],
                'name': obj['name'],
                'number_of_days': 3,
            }
            holiday_output.append(output_obj)
    u.store_json(final_json_fp, holiday_output)
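
# A worked example of the date handling above, with an illustrative value
# (the exact raw date strings depend on the scraped page):
# >>> dt = datetime.datetime.strptime('Jan 1', '%b %d')
# >>> dt.strftime('%B'), dt.day
# ('January', 1)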
import subprocess as sp
from multiprocessing.pool import ThreadPool
from pathlib import Path

import requests

# Project-local helpers (read_json, store_json, parse_args, print_quit, login,
# get_course_url, parse_lec_ranges, rename_old, make_filename, DirServer,
# download_stream) and constants (CONFIG_FILE, DATA_FILE, sp_args, IMP_BASE_URL,
# IMP_STREAM_URL, ANGLE_CHOICES) are defined elsewhere in this script.


def main():
    try:
        sp.check_call(["ffmpeg", "-version"], **dict(sp_args, stdout=sp.DEVNULL))
    except FileNotFoundError:
        print_quit("ffmpeg not found. Ensure it is present in PATH.")
    config = read_json(CONFIG_FILE, verbose=True)
    data = read_json(DATA_FILE) or {"urls": {}}
    args = parse_args(config, data["urls"])
    if not args.username or not args.password:
        print_quit("Email and password not provided.")
    if args.save_creds:
        config["creds"] = {
            "username": args.username,
            "password": args.password,
        }
        store_json(config, CONFIG_FILE)
    course_lectures_url = get_course_url(args, data["urls"])
    token = login(args.username, args.password)
    headers = {"Authorization": "Bearer " + token}
    response = requests.get(course_lectures_url, headers=headers)
    if not response.ok:
        print_quit("Error fetching course info. Is the URL correct?")
    lectures = response.json()
    total_lecs = len(lectures)
    subject_name = "{subjectName} {sessionName}".format(**lectures[0])
    working_dir: Path = args.dest / subject_name
    working_dir.mkdir(exist_ok=True, parents=True)
    print(f'Saving to "{working_dir}"')
    data["urls"][subject_name.upper()] = course_lectures_url
    store_json(data, DATA_FILE)
    lecture_ids = parse_lec_ranges(args.range, total_lecs)
    # Map already-downloaded lecture numbers (two-digit file prefixes) to paths.
    downloaded: dict = {
        int(file.stem[:2]): file
        for file in working_dir.glob("[0-9][0-9].*.mkv")
        if int(file.stem[:2]) in lecture_ids
    }
    if downloaded:
        if args.rename:
            rename_old(downloaded, lectures)
        if args.only_new:
            lecture_ids.difference_update(range(max(downloaded) + 1))
        elif args.force:
            print("Force option enabled. Deleting old lectures:", *sorted(downloaded))
            for file in downloaded.values():
                file.unlink()
        else:
            print("Skipping already downloaded lectures:", *sorted(downloaded))
            lecture_ids.difference_update(downloaded)
    if not lecture_ids:
        print_quit("No lectures to download. Exiting.", 0)
    no_class = []
    task_args = []
    for lecture in reversed(lectures):  # Download lecture #1 first.
        lec_no = lecture["seqNo"]
        if lec_no not in lecture_ids:
            continue
        file_name = make_filename(lecture)
        if not args.keep_no_class and "no class" in file_name.lower():
            no_class.append(lec_no)
            continue
        stream_url = IMP_BASE_URL + IMP_STREAM_URL.format(lecture["ttid"], token)
        task_args.append((
            token,
            stream_url,
            working_dir / file_name,
            args.quality,
            ANGLE_CHOICES.index(args.angle),
        ))
    if no_class:
        print("Skipping lectures with 'no class' in title:", *no_class)
    print("Downloading lecture numbers:", *sorted(lecture_ids.difference(no_class)))
    with DirServer(), ThreadPool(args.worker_processes) as pool:
        try:
            pool.starmap(download_stream, task_args)
            pool.close()
            pool.join()
        except KeyboardInterrupt:
            print_quit("Aborted.", 1)
    print("Finished!")
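
# `parse_lec_ranges` is defined elsewhere in this script. A minimal sketch of
# the assumed contract: turn a spec such as "2,5:9,12" into a set of lecture
# numbers, defaulting to every lecture when no spec is given (the separator
# syntax here is an assumption, not necessarily the tool's real one):
def parse_lec_ranges(spec, total_lecs):
    """Return the set of lecture numbers selected by `spec`."""
    if not spec:
        return set(range(1, total_lecs + 1))
    selected = set()
    for part in spec.split(","):
        if ":" in part:
            # "start:end" selects an inclusive run of lectures.
            start, end = part.split(":")
            selected.update(range(int(start), int(end) + 1))
        else:
            selected.add(int(part))
    return selected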
# Django models (Candidate, PoliticalParty, JobRole, Expenses), the TSE fetch
# helpers, `store_json`, `logger`, and `current_time` are imported or defined
# at module level elsewhere in this file.


def process_candidate(row):
    try:
        # `row` is an (index, Series) pair from DataFrame.iterrows().
        candidate = row[1]
        profile = fetch_2018_candidate(
            id_=candidate['SQ_CANDIDATO'],
            state=candidate['SG_UF'],
        )
        store_json(
            'profile_{id_}_{partido}_{uf}_{time}'.format(
                id_=candidate['SQ_CANDIDATO'],
                partido=candidate['NR_PARTIDO'],
                uf=candidate['SG_UF'],
                time=current_time,
            ),
            profile,
        )
        expenses = fetch_2018_candidate_expenses(
            estado=candidate['SG_UF'],
            candidate=candidate['SQ_CANDIDATO'],
            urna=candidate['NR_CANDIDATO'],
            cargo=candidate['CD_CARGO'],
            partido=candidate['NR_PARTIDO'],
        )
        store_json(
            'expenses_{id_}_{partido}_{uf}_{time}'.format(
                id_=candidate['SQ_CANDIDATO'],
                partido=candidate['NR_PARTIDO'],
                uf=candidate['SG_UF'],
                time=current_time,
            ),
            expenses,
        )
        gender = 'F' if profile.get('descricaoSexo') == 'FEM.' else 'M'
        political_party, created = PoliticalParty.objects.get_or_create(
            number=profile.get('partido').get('numero'),
            defaults={
                'initials': profile.get('partido').get('sigla').upper(),
                'name': profile.get('partido').get('nome'),
            })
        job_role, created = JobRole.objects.get_or_create(
            name=profile.get('cargo').get('nome'),
            code=candidate['CD_CARGO'],
        )
        # Backfill the code on job roles created before codes were recorded.
        if not created and job_role.code is None:
            job_role.code = candidate['CD_CARGO']
            job_role.save()
        candidate_model, created = Candidate.objects.update_or_create(
            id_tse=profile.get('id'),
            defaults={
                'year': '2018',
                'gender': gender,
                'number': profile.get('numero'),
                'name': profile.get('nomeCompleto'),
                'name_ballot': profile.get('nomeUrna'),
                'job_role': job_role,
                'political_party': political_party,
                'coalition': profile.get('nomeColigacao'),
                'picture_url': profile.get('fotoUrl'),
                'budget_1t': profile.get('gastoCampanha1T'),
                'budget_2t': profile.get('gastoCampanha2T'),
                'birth_date': profile.get('dataDeNascimento'),
                'marital_status': profile.get('descricaoEstadoCivil'),
                'education': profile.get('grauInstrucao'),
                'job': profile.get('ocupacao'),
                'state': profile.get('sgUe') or candidate['SG_UF'],
                'property_value': profile.get('totalDeBens'),
                'email': candidate['NM_EMAIL'],
            })
        try:
            # Persist the consolidated totals from the expenses payload.
            expenses_model = Expenses.objects.create(
                candidate=candidate_model,
                received=expenses['dadosConsolidados']['totalRecebido'],
                paid=expenses['despesas']['totalDespesasPagas'],
            )
            logger.debug('got expenses: {}'.format(profile.get('nomeCompleto')))
        except Exception:
            logger.debug('missing expenses: {}'.format(profile.get('nomeCompleto')))
        logger.debug('created: {}'.format(profile.get('nomeCompleto')))
    except Exception:
        logger.exception('creating candidate from endpoint')
        # Fall back to the raw CSV row when the endpoint data is unusable.
        try:
            # The TSE data encodes female candidates as CD_GENERO == 4.
            gender = 'F' if candidate['CD_GENERO'] == 4 else 'M'
            political_party, created = PoliticalParty.objects.get_or_create(
                number=candidate['NR_PARTIDO'],
                defaults={
                    'initials': candidate['SG_PARTIDO'],
                    'name': candidate['NM_PARTIDO'],
                })
            job_role, created = JobRole.objects.get_or_create(
                name=candidate['DS_CARGO'],
                code=candidate['CD_CARGO'],
            )
            # Convert DD/MM/YYYY to ISO-style YYYY-MM-DD.
            birth = '-'.join(reversed(candidate['DT_NASCIMENTO'].split('/')))
            candidate_model, created = Candidate.objects.update_or_create(
                id_tse=candidate['SQ_CANDIDATO'],
                defaults={
                    'year': '2018',
                    'gender': gender,
                    'number': candidate['NR_CANDIDATO'],
                    'name': candidate['NM_CANDIDATO'],
                    'name_ballot': candidate['NM_URNA_CANDIDATO'],
                    'job_role': job_role,
                    'political_party': political_party,
                    'coalition': candidate['NM_COLIGACAO'],
                    'birth_date': birth,
                    'marital_status': candidate['DS_ESTADO_CIVIL'],
                    'education': candidate['DS_GRAU_INSTRUCAO'],
                    'job': candidate['DS_OCUPACAO'],
                    'state': candidate['SG_UF'],
                    'email': candidate['NM_EMAIL'],
                })
        except Exception:
            logger.exception('creating candidates from df')
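
# A hypothetical driver for process_candidate, assuming the candidate list is
# a TSE CSV loaded with pandas; the file name, separator, and encoding below
# are assumptions for illustration, not taken from the project:
import pandas as pd


def process_all(csv_path='consulta_cand_2018.csv'):
    df = pd.read_csv(csv_path, sep=';', encoding='latin-1')
    # iterrows() yields the (index, Series) pairs process_candidate expects.
    for row in df.iterrows():
        process_candidate(row)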