def main():
    """Scrape the holidays table from ``c.HOLIDAYS_URL`` and dump matching
    rows to the raw-output JSON file.

    NOTE(review): the membership test *keeps* rows whose type appears in
    ``c.IGNORE_HOLIDAY_TYPES``, which contradicts the constant's name —
    behavior preserved here; confirm the constant's intent.
    """
    raw_output = []
    html = u.load_url(c.HOLIDAYS_URL)
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        cells = soup.findAll(['th', 'td'])
        # The table is flattened into 4 cells per row: date, day, name, type.
        # Iterate complete rows only (floor division); this also matches the
        # original Python 2 runtime behavior, where `/` truncated before the
        # ceil could take effect.
        row_len = len(cells) // 4
        # Start at 1 to skip the header row.
        for ctr in range(1, row_len):
            offset = ctr * 4
            holiday_type = cells[offset + 3].text.split()
            if holiday_type[0] in c.IGNORE_HOLIDAY_TYPES:
                raw_output.append({
                    'date': cells[offset + 0].text.strip(),
                    'day': cells[offset + 1].text.strip(),
                    'name': cells[offset + 2].text.strip(),
                    'type': holiday_type[0],
                })
            else:
                # Py3 print call (original used the Py2 print statement).
                print(holiday_type)

    output_fp = os.path.join(c.RESOURCE_DIRECTORY, c.RAW_OUTPUT_JSON)
    u.store_json(output_fp, raw_output)
Esempio n. 2
0
 def process(self):
     """Repeatedly crawl ``self.web_seeds`` using a pool of worker threads.

     In DEBUG mode a single pass runs, the collected JSON is dumped and the
     loop exits; otherwise it sleeps and re-crawls every 10 minutes.
     """
     while True:
         # Partition the seed urls into one chunk per worker thread.
         sub_sample_list = list_cut(self.web_seeds, self.threads_num)
         threads = []
         for i in range(self.threads_num):
             thread = threading.Thread(
                 target=self.handle_task, args=(i, sub_sample_list[i]))
             thread.start()
             threads.append(thread)
         # Plain loop for the join side effect (the original built and
         # discarded a list comprehension).
         for worker in threads:
             worker.join()
         log_content = "main thread of instagram end. process web set num is %d" % (len(self.web_seeds))
         logging.info(log_content)
         if self.DEBUG:
             store_json(self.debug_json_file, self.test_json)
             print(self.debug_log_dict)
             break
         else:
             time.sleep(10 * 60)  # crawl and re-check the hub page every 10 minutes
Esempio n. 3
0
def main():
    """Convert the raw scraped holiday JSON into the final output format.

    Keeps only holidays whose day is in ``c.ACCEPTABLE_DAYS`` and expands the
    'Mon DD' date string into separate month/date fields.
    """
    raw_json_fp = os.path.join(c.RESOURCE_DIRECTORY, c.RAW_OUTPUT_JSON)
    final_json_fp = os.path.join(c.RESOURCE_DIRECTORY, c.FINAL_OUTPUT_JSON)

    holiday_input = u.load_json(raw_json_fp)
    holiday_output = []

    for obj in holiday_input:
        if obj['day'] not in c.ACCEPTABLE_DAYS:
            continue
        # Dates arrive as e.g. 'Jan 01'; strptime defaults the year to 1900,
        # which is fine since only month and day are used below.
        dt = datetime.datetime.strptime(obj['date'], '%b %d')
        holiday_output.append({
            'type': obj['type'],
            'month': dt.strftime('%B'),
            'date': dt.day,  # idiomatic: avoids int(dt.strftime('%d'))
            'day': obj['day'],
            'name': obj['name'],
            'number_of_days': 3,
        })

    u.store_json(final_json_fp, holiday_output)
Esempio n. 4
0
def main():
    """CLI entry point: authenticate, resolve the course URL, and download
    the selected lecture streams in parallel.

    Exits early via ``print_quit`` on missing ffmpeg, missing credentials,
    a failed course-info fetch, or an empty download set.
    """
    # ffmpeg is a hard runtime dependency; probe for it before doing anything.
    try:
        sp.check_call(["ffmpeg", "-version"], **dict(sp_args,
                                                     stdout=sp.DEVNULL))
    except FileNotFoundError:
        print_quit("ffmpeg not found. Ensure it is present in PATH.")
    config = read_json(CONFIG_FILE, verbose=True)
    data = read_json(DATA_FILE) or {"urls": {}}
    args = parse_args(config, data["urls"])
    if not args.username or not args.password:
        print_quit("Email and password not provided.")
    if args.save_creds:
        # Persist credentials only on explicit opt-in.
        config["creds"] = {
            "username": args.username,
            "password": args.password
        }
        store_json(config, CONFIG_FILE)

    course_lectures_url = get_course_url(args, data["urls"])
    token = login(args.username, args.password)

    headers = {"Authorization": "Bearer " + token}
    response = requests.get(course_lectures_url, headers=headers)
    if not response.ok:
        print_quit("Error fetching course info. Is the url proper?")

    lectures = response.json()
    total_lecs = len(lectures)
    # assumes every lecture dict carries subjectName/sessionName — TODO confirm
    subject_name = "{subjectName} {sessionName}".format(**lectures[0])
    working_dir: Path = args.dest / subject_name
    working_dir.mkdir(exist_ok=True, parents=True)
    print(f'Saving to "{working_dir}"')
    # Remember the course URL so future runs can resolve it by subject name.
    data["urls"][subject_name.upper()] = course_lectures_url
    store_json(data, DATA_FILE)

    lecture_ids = parse_lec_ranges(args.range, total_lecs)

    # Map lecture number -> file for lectures already on disk; the glob
    # matches 'NN.*.mkv', so the first two stem chars are the lecture number.
    downloaded: dict = {
        int(file.stem[:2]): file
        for file in working_dir.glob("[0-9][0-9].*.mkv")
        if int(file.stem[:2]) in lecture_ids
    }
    if downloaded:
        if args.rename:
            rename_old(downloaded, lectures)
        if args.only_new:
            # Drop everything up to and including the newest local lecture.
            lecture_ids.difference_update(range(max(downloaded) + 1))
        elif args.force:
            print("Force option enabled. Deleting old lectures:",
                  *sorted(downloaded))
            for file in downloaded.values():
                file.unlink()
        else:
            print("Skipping already downloaded lectures:", *sorted(downloaded))
            lecture_ids.difference_update(downloaded)
    if not lecture_ids:
        print_quit("No lectures to download. Exiting.", 0)

    no_class = []
    task_args = []
    for lecture in reversed(lectures):  # Download lecture #1 first
        lec_no = lecture["seqNo"]

        if lec_no not in lecture_ids:
            continue

        file_name = make_filename(lecture)

        # Unless asked to keep them, skip placeholder "no class" entries.
        if not args.keep_no_class and "no class" in file_name.lower():
            no_class.append(lec_no)
            continue

        stream_url = IMP_BASE_URL + IMP_STREAM_URL.format(
            lecture["ttid"], token)
        task_args.append((
            token,
            stream_url,
            working_dir / file_name,
            args.quality,
            ANGLE_CHOICES.index(args.angle),
        ))

    if no_class:
        print("Skipping lectures with 'no class' in title:", *no_class)

    print("Downloading lecture numbers:",
          *sorted(lecture_ids.difference(no_class)))

    # Fan the downloads out over a thread pool; DirServer is a context
    # manager defined elsewhere — presumably serves files locally, confirm.
    with DirServer(), ThreadPool(args.worker_processes) as pool:
        try:
            pool.starmap(download_stream, task_args)
            pool.close()
            pool.join()
        except KeyboardInterrupt:
            print_quit("Aborted.", 1)
    print("Finished!")
def process_candidate(row):
    """Fetch a 2018 candidate's profile and expenses from the remote endpoint,
    archive the raw payloads as JSON, and upsert the Django models.

    ``row`` is an (index, record) pair — only ``row[1]`` is used, a mapping
    keyed by the TSE CSV column names (SQ_CANDIDATO, SG_UF, ...). On any
    failure in the endpoint path, falls back to creating the candidate from
    the CSV row data alone.
    """
    try:

        candidate = row[1]

        profile = fetch_2018_candidate(
            id_=candidate['SQ_CANDIDATO'],
            state=candidate['SG_UF'],
        )

        # Archive the raw profile payload, keyed by candidate/party/state/time.
        store_json(
            'profile_{id_}_{partido}_{uf}_{time}'.format(
                id_=candidate['SQ_CANDIDATO'],
                partido=candidate['NR_PARTIDO'],
                uf=candidate['SG_UF'],
                time=current_time,
            ),
            profile,
        )

        expenses = fetch_2018_candidate_expenses(
            estado=candidate['SG_UF'],
            candidate=candidate['SQ_CANDIDATO'],
            urna=candidate['NR_CANDIDATO'],
            cargo=candidate['CD_CARGO'],
            partido=candidate['NR_PARTIDO'],
        )

        # Archive the raw expenses payload alongside the profile dump.
        store_json(
            'expenses_{id_}_{partido}_{uf}_{time}'.format(
                id_=candidate['SQ_CANDIDATO'],
                partido=candidate['NR_PARTIDO'],
                uf=candidate['SG_UF'],
                time=current_time,
            ),
            expenses,
        )

        # Endpoint encodes gender as the string 'FEM.' for female.
        gender = 'F' if 'FEM.' == profile.get('descricaoSexo') else 'M'

        political_party, created = PoliticalParty.objects.get_or_create(
            number=profile.get('partido').get('numero'),
            defaults={
                'initials': profile.get('partido').get('sigla').upper(),
                'name': profile.get('partido').get('nome'),
            })

        job_role, created = JobRole.objects.get_or_create(
            name=profile.get('cargo').get('nome'),
            code=candidate['CD_CARGO'],
        )

        # Backfill the role code on pre-existing rows that were created
        # without one.
        if not created and job_role.code is None:
            job_role.code = candidate['CD_CARGO']
            job_role.save()

        candidate_model, created = Candidate.objects.update_or_create(
            id_tse=profile.get('id'),
            defaults={
                'year': '2018',
                'gender': gender,
                'number': profile.get('numero'),
                'name': profile.get('nomeCompleto'),
                'name_ballot': profile.get('nomeUrna'),
                'job_role': job_role,
                'political_party': political_party,
                'coalition': profile.get('nomeColigacao'),
                'picture_url': profile.get('fotoUrl'),
                'budget_1t': profile.get('gastoCampanha1T'),
                'budget_2t': profile.get('gastoCampanha2T'),
                'birth_date': profile.get('dataDeNascimento'),
                'marital_status': profile.get('descricaoEstadoCivil'),
                'education': profile.get('grauInstrucao'),
                'job': profile.get('ocupacao'),
                # Prefer the endpoint's state, fall back to the CSV column.
                'state': profile.get('sgUe') or candidate['SG_UF'],
                'property_value': profile.get('totalDeBens'),
                'email': candidate['NM_EMAIL'],
            })

        # Expenses are best-effort: the payload may lack the nested keys, in
        # which case the candidate is kept without an Expenses row.
        try:
            expenses = Expenses.objects.create(
                candidate=candidate_model,
                received=expenses['dadosConsolidados']['totalRecebido'],
                paid=expenses['despesas']['totalDespesasPagas'],
            )
            logger.debug('got expenses: {}'.format(
                profile.get('nomeCompleto')))
        except Exception:
            logger.debug('missing expenses: {}'.format(
                profile.get('nomeCompleto')))

        logger.debug('created: {}'.format(profile.get('nomeCompleto')))

    except Exception:
        # Endpoint path failed — log with traceback, then build the candidate
        # from the CSV row data only (no picture, budget, or property fields).
        logger.exception(f'creating candidate from endpoint')

        try:
            # CSV encodes gender numerically; 4 means female here — confirm
            # against the TSE data dictionary.
            gender = 'F' if 4 == candidate['CD_GENERO'] else 'M'

            political_party, created = PoliticalParty.objects.get_or_create(
                number=candidate['NR_PARTIDO'],
                defaults={
                    'initials': candidate['SG_PARTIDO'],
                    'name': candidate['NM_PARTIDO'],
                })

            job_role, created = JobRole.objects.get_or_create(
                name=candidate['DS_CARGO'],
                code=candidate['CD_CARGO'],
            )

            # Convert DD/MM/YYYY to ISO-style YYYY-MM-DD.
            birth = '-'.join(reversed(candidate['DT_NASCIMENTO'].split('/')))

            candidate_model, created = Candidate.objects.update_or_create(
                id_tse=candidate['SQ_CANDIDATO'],
                defaults={
                    'year': '2018',
                    'gender': gender,
                    'number': candidate['NR_CANDIDATO'],
                    'name': candidate['NM_CANDIDATO'],
                    'name_ballot': candidate['NM_URNA_CANDIDATO'],
                    'job_role': job_role,
                    'political_party': political_party,
                    'coalition': candidate['NM_COLIGACAO'],
                    'birth_date': birth,
                    'marital_status': candidate['DS_ESTADO_CIVIL'],
                    'education': candidate['DS_GRAU_INSTRUCAO'],
                    'job': candidate['DS_OCUPACAO'],
                    'state': candidate['SG_UF'],
                    'email': candidate['NM_EMAIL'],
                })

        except Exception:
            # Both paths failed — record the traceback and move on.
            logger.exception('creating candidates from df')