def get_user_media(username):
    """Scrape the profile page of ``username`` and return its feed.

    The profile HTML is searched for the JSON blob assigned to
    ``window._sharedData``; the blob is decoded and flattened into a plain
    dict, which is then wrapped in ``InstagramUserFeed``.

    :param username: profile name appended to ``https://www.instagram.com/``.
    :return: ``InstagramUserFeed`` built from the collected dict. The dict is
        empty when the JSON blob is not found in the page, and may be only
        partially filled when the blob lacks the expected structure.
    """
    result = {}
    # NOTE(review): no timeout is passed; per the requests documentation the
    # call then never times out on its own.
    r = requests.get('https://www.instagram.com/' + username)
    data_search = re.search(
        '<script type="text/javascript">window._sharedData = (.*);</script>',
        r.text, re.IGNORECASE)
    if not data_search:
        log.error('Failed to extract meta-information from HTML page')
        return InstagramUserFeed(result)

    data = json.loads(data_search.group(1))
    try:
        user = data['entry_data']['ProfilePage'][0]['graphql']['user']
        timeline = user['edge_owner_to_timeline_media']
        result['user_id'] = user['id']
        result['user_username'] = user['username']
        result['follower'] = user['edge_followed_by']['count']
        result['follows'] = user['edge_follow']['count']
        result['media_count'] = timeline['count']
        result['media'] = []
        result['media_ids'] = set()
        for edge in timeline['edges']:
            node = edge['node']
            captions = node['edge_media_to_caption']['edges']
            post = {
                'id': node['id'],
                'timestamp': node['taken_at_timestamp'],
                'is_video': node['is_video'],
                'caption': captions[0]['node']['text']
                if captions else "Could not find caption",
                'thumbnail': node['thumbnail_src'],
                'image': node['display_url']
            }
            result['media'].append(post)
            result['media_ids'].add(post['id'])
    except (KeyError, IndexError, TypeError) as exception:
        # A missing key is not the only way the payload can be malformed:
        # an empty 'ProfilePage' list raises IndexError and a null node
        # raises TypeError when subscripted. Treat all three the same way.
        log.error(
            'Unexpected response retrieving {} info: {!r}\n\nData: {}'.
            format(username, exception, data))
        return InstagramUserFeed(result)

    log.info('Scraped ' + result['user_username'] + ' and ' +
             str(len(result['media'])) + ' posts')
    return InstagramUserFeed(result)
def select_police_stations(name: str, value: str) -> None:
    """
    Select a police station in a ``<select>`` element.

    :param name: ``name`` attribute of the select element.
    :param value: value of the option to select.
    :raise NoSuchElementException: when the lookup or the selection raises
        it; the original error text is logged before re-raising with a
        message that names ``value``.
    """
    try:
        dropdown = Select(driver.find_element_by_name(name))
        dropdown.select_by_value(str(value))
    except NoSuchElementException as e:
        error(str(e))
        raise NoSuchElementException(
            f'Não foi possível encontrar a opção com o valor {value}')
def select_option(name: str, value: str) -> None:
    """
    Select an option of a ``<select>`` element by its value.

    ``value`` is not used directly: it subscripts the result of
    ``get_values(name)`` and the entry found there is what gets selected.
    That lookup happens outside the ``try``, so any error it raises
    propagates unchanged.

    :param name: ``name`` attribute of the select element.
    :param value: subscript used on ``get_values(name)``.
    :raise NoSuchElementException: when the lookup or the selection raises
        it; the original error text is logged before re-raising.
    """
    available = get_values(name)
    to_select: str = available[value]
    try:
        dropdown = Select(driver.find_element_by_name(name))
        dropdown.select_by_value(str(to_select))
    except NoSuchElementException as e:
        error(str(e))
        raise NoSuchElementException(
            f'Não foi possível encontrar a opção com o valor {to_select}')
def _find_download_url(self, ep_page_html):
    """Return the download link found in an episode page.

    Every ``<a>`` tag is inspected and an ``href`` containing
    ``'googleusercontent'`` counts as a download link. When several anchors
    match, an error is logged for each extra one and the last match wins.

    :param ep_page_html: HTML of the episode page.
    :return: the ``href`` of the (last) matching anchor.
    :raise RuntimeError: when no anchor matches.
    """
    marker = 'googleusercontent'
    soup = BeautifulSoup(ep_page_html, SOUP_PARSER_HTML)
    found = None
    for href in (anchor.get('href') for anchor in soup.find_all('a')):
        # str() keeps a missing href (None) from breaking the substring test
        if marker not in str(href):
            continue
        if found is not None:
            error('more than one download link found; {}, {}'.format(
                found, href))
        found = href
    if found is None:
        raise RuntimeError('no download link found')
    return found
def __database(cls, db_name: Optional[str] = 'scraping') -> Database:
    """
    Return the database ``db_name`` from the class-level connection.

    When ``db_name`` is not among the connection's database names, the
    database is obtained by subscripting the connection and an info message
    is logged. Otherwise it is fetched with ``get_database``.

    An ``OperationFailure`` raised while subscripting is logged and NOT
    re-raised; execution then falls back to ``get_database``.

    :param db_name: name of the database to create or return
        (defaults to ``'scraping'``).
    :return: the database object.
    """
    known_names = cls.__CONN.list_database_names()
    if db_name not in known_names:
        try:
            database: Database = cls.__CONN[db_name]
        except OperationFailure as e:
            error(f'Erro ao criar o database: {e.__str__()}')
        else:
            info(f'Database {db_name} criado.')
            return database
    return cls.__CONN.get_database(db_name)
def generate_report(domaine: str):
    """Render the HTML report of ``domaine`` from its template.

    The template ``template_<domaine>.html`` is read from
    ``TEMPLATES_FOLDER`` and the result ``rapport_<domaine>.html`` is written
    under ``OUTPUT_FOLDER``, both relative to the current working directory.

    :param domaine: report domain; must be one of ``DOMAINES``, otherwise an
        error is logged and nothing is generated.
    """
    cwd = os.getcwd()
    template_name = "template_{}.html".format(domaine)
    template_path = os.path.join(cwd, TEMPLATES_FOLDER, template_name)
    report_name = "rapport_{}.html".format(domaine)
    report_path = os.path.join(cwd, OUTPUT_FOLDER, report_name)

    if domaine not in DOMAINES:
        log.error("Domaine {} inconnu".format(domaine))
        return

    log.info("Génération du rapport {}...".format(report_name))
    # NOTE(review): a domain listed in DOMAINES but not handled below leaves
    # `datas` unbound and the render call fails.
    if domaine == "client":
        datas = DonneesClient().tags
    elif domaine == "paiement":
        datas = DonneesPaiement().tags

    # Jinja2
    templater = DefaultTemplater(template_path, report_path)
    templater.render(datas)
    log.info("Rapport {} généré !".format(report_name))
def read_parameters():
    """Parse the command line and return the populated ``Parameters``.

    Recognised options:

    * ``-h`` / ``--help``: print the usage and exit.
    * ``-d VALUE`` / ``--domaine VALUE`` / ``--domaine=VALUE``: set
      ``parameters.domaine``.

    An option that getopt rejects is logged and the program exits with
    status 2. The collected ``domaine`` is passed to ``check_parameters``
    before returning.

    :return: the ``Parameters`` instance filled from the command line.
    """
    try:
        # "domaine=": the trailing "=" tells getopt that the long option
        # takes an argument, mirroring the ":" after "d" in the short spec.
        # Without it, "--domaine=x" is rejected and "--domaine x" yields an
        # empty value.
        opts, _args = getopt.getopt(sys.argv[1:], "hd:",
                                    ["help", "domaine="])
    except getopt.GetoptError as err:
        log.error(str(err))
        sys.exit(2)

    parameters = Parameters()
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            log.usage()
            sys.exit()
        elif opt in ("-d", "--domaine"):
            parameters.domaine = arg
        else:
            # Explicit raise instead of `assert False`, which is stripped
            # when Python runs with -O.
            raise AssertionError("Option non prise en compte")

    check_parameters(parameters.domaine)
    return parameters
def __get_records(self) -> dict:
    """
    Build a dict with the figures of one crime from the occurrences table.

    The header (``th``) and data (``td``) cells of the table are extracted;
    the crime name is located among the data cells and the cells following
    it are taken as that crime's row. Keys are the lower-cased header cells
    except ``'Natureza'``; values are the row cells converted to ``float``,
    with ``'...'`` replaced by ``'0'`` first.

    :raise ValueError: when the crime is not present among the data cells.
    :return: dict mapping each header cell to the matching figure.
    """
    table_header: list = extract_table_value(self.__id_table, 'th')
    table_datas: list = extract_table_value(self.__id_table, 'td')
    try:
        start: int = table_datas.index(self.__crime)
    except ValueError as e:
        error(
            f'Erro ao obter os dados da região {self.__region}.\n Detalhes: {e.__str__()}'
        )
        raise ValueError(
            f'O crime {self.__crime} não está presente na tabela.')

    row: list = table_datas[start:start + len(table_header)]
    keys: list = [
        column.lower() for column in table_header if column != 'Natureza'
    ]
    # The first cell of the row is the crime name itself, not a figure.
    row.pop(0)
    values: list = [float(cell.replace('...', '0')) for cell in row]
    info(f'Registros da região {self.__region} obtidos.')
    return dict(zip(keys, values))
def collection(
        self, is_current_occurrences: Optional[bool] = False) -> Collection:
    """
    Return the occurrences collection, creating it first when missing.

    The collection name is ``'current_occurrences'`` when
    ``is_current_occurrences`` is truthy and ``'last_occurrences'``
    otherwise. An ``OperationFailure`` raised during creation is logged and
    not re-raised; the collection is then fetched regardless.

    :param is_current_occurrences: selects which of the two collections is
        created or returned.
    :return: the selected collection.
    """
    db: Database = self.__database()
    if is_current_occurrences:
        coll_name: str = 'current_occurrences'
    else:
        coll_name = 'last_occurrences'

    existing_names = db.list_collection_names()
    if coll_name not in existing_names:
        try:
            db.create_collection(coll_name)
        except OperationFailure as e:
            error(f'Erro ao criar a collection: {e.__str__()}')
        else:
            info(f'Collection {coll_name} criada.')
    return db.get_collection(coll_name)
def check_parameters(domaine):
    """Abort the program when the mandatory ``domaine`` parameter is missing.

    :param domaine: domain of the report to generate; any falsy value
        (``None``, empty string) counts as missing.
    :return: ``None`` when ``domaine`` is present; otherwise the error and
        the usage are logged and the process exits with status 2.
    """
    # Mandatory parameter: without it we stop.
    if not domaine:
        log.error("Le domaine du rapport à générer est obligatoire")
        log.usage()
        # Exit with a non-zero status: a bare sys.exit() reports success (0)
        # even though the invocation was invalid.
        sys.exit(2)
def __init__(self):
    """Load the run configuration from environment variables.

    Reads flags and sizes from ``os.environ`` and then settles the device
    layout: ``devices``, ``validation_device`` and the effective
    ``gpu_count``.

    Adjustments made along the way, each one logged:

    * ``artificial_batch_size`` is raised to ``batch_size`` when smaller.
    * ``gpu_count`` is reset to 0 when GPUs were requested but CUDA is not
      available, and capped at ``torch.cuda.device_count()``.
    * When ``HOLD_ONE_GPU_FOR_VALIDATION`` is set and more than one GPU is
      in use, the last device is reserved for validation and removed from
      ``devices``; otherwise validation shares the first device.

    :raise ValueError: when ``image_size`` is set to a non-integer value.
    """
    from src import log

    # Defaults for strtobool are strings: environment values are always
    # strings, and a distutils-style strtobool calls .lower() on its
    # argument, which a bool default does not have.
    # NOTE(review): assumes strtobool accepts "True"/"False" - confirm
    # against its definition.
    self.DEBUG: bool = strtobool(os.environ.get("DEBUG", "False"))
    self.use_cache: bool = strtobool(os.environ.get("use_cache", "True"))
    try:
        self.image_size: int = int(os.environ.get("image_size", 256))
    except ValueError:
        log.error("Env variable image_size must be set to an integer")
        raise
    except Exception:
        log.error("Uknown error when loading image_size from environment")
        raise
    self.include_healthy_annotations: bool = strtobool(
        os.environ.get("include_healthy_annotations", "False"))
    self.include_records_without_annotations: bool = strtobool(
        os.environ.get("include_records_without_annotations", "False"))

    self.batch_size: int = int(os.environ.get("batch_size", 16))
    self.artificial_batch_size: int = int(
        os.environ.get("artificial_batch_size", 256))
    if self.batch_size > self.artificial_batch_size:
        log.warn(
            f"Artificial batch size was smaller than batch size, this is not possible ({self.batch_size} > {self.artificial_batch_size}), artificial batch size set to batch size"
        )
        self.artificial_batch_size = self.batch_size

    self.gpu_count: int = int(os.environ.get("GPU_COUNT", 0))
    self.one_gpu_for_validation: bool = strtobool(
        os.environ.get("HOLD_ONE_GPU_FOR_VALIDATION", "False"))
    self.use_gpu: bool = self.gpu_count > 0 and torch.cuda.is_available()
    if self.gpu_count > 0 and not self.use_gpu:
        # The line break inside the message is written as an escape; a raw
        # newline inside a "..." literal does not parse.
        log.error(
            "Attempted to utilize a GPU but no GPU or CUDA Driver was found. \nDefaulting to CPU"
        )
        self.gpu_count = 0
    if self.use_gpu and self.gpu_count > torch.cuda.device_count():
        log.warn(
            f"Attempted to utilize more GPUs than allowed, setting gpu count to {torch.cuda.device_count()}"
        )
        self.gpu_count = torch.cuda.device_count()

    self.devices: List[torch.device]
    self.validation_device: torch.device
    if self.use_gpu:
        self.devices = [
            torch.device(f'cuda:{x}') for x in range(self.gpu_count)
        ]
    else:
        self.devices = [torch.device('cpu')]

    if self.use_gpu and self.gpu_count > 1 and self.one_gpu_for_validation:
        # Reserve the last device for validation and train on the rest.
        self.validation_device = self.devices[-1]
        self.devices = self.devices[:-1]
        self.gpu_count -= 1
    elif self.one_gpu_for_validation and self.gpu_count <= 1:
        log.warn(
            "Attempted to hold off one GPU for validation, but only 1 gpu was found, defaulting validation device to base device"
        )
        self.validation_device = self.devices[0]
    else:
        self.validation_device = self.devices[0]

    self.distribute_across_gpus: bool = strtobool(
        os.environ.get("distribute_across_gpus", "False"))
    if self.use_gpu:
        torch.cuda.set_device(0)