def convert_data_to_table_format():
    logger.info("transform")
    # Transform the gathered data from the JSON file to a pandas DataFrame
    # and save it as CSV.
    storage = FileStorage(SCRAPPED_FILE)
    parser = Parser(storage)
    parser.parse(TABLE_FORMAT_FILE)
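The two constants above are defined elsewhere in the module; a minimal sketch of plausible values, with both paths purely illustrative:

# Hypothetical values for the module-level constants used above.
SCRAPPED_FILE = 'data/scraped.json'      # raw JSON produced by the scraper
TABLE_FORMAT_FILE = 'data/table.csv'     # CSV output of the transform step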
def start():
    if validate_properties(config.get_properties()):
        logger.info('Starting application')
        parser = Parser(config)
        parser.run()
    else:
        logger.error('Invalid configuration properties')
def get_metadata_of_incoming_statements(
        incoming_dir: Path) -> list[IncomingStatement]:
    incoming_statements = []
    for bankpath in sorted(incoming_dir.iterdir()):
        if not bankpath.is_dir():
            continue
        bank = bankpath.name
        bank_parsers = parsers.get(bank)
        if bank_parsers is None:
            print('unknown bank:', bank, file=sys.stderr)
            continue
        filenames = sorted(bankpath.iterdir())
        if filenames:
            print('importing bank statements from', bank)
        for src_file in filenames:
            try:
                extension = src_file.suffix.lower()
                Parser = bank_parsers[extension]
            except KeyError:
                continue
            parser = Parser(src_file)
            m = parser.parse_metadata()
            print(f'{m.start_date} → {m.end_date}: {src_file}')
            incoming_statements.append(IncomingStatement(
                statement_path=src_file,
                parser=parser,
                metadata=m,
            ))
    return incoming_statements
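The lookup above assumes a `parsers` registry that maps a bank's directory name to the parser class for each file extension; a hedged sketch, with bank and class names hypothetical:

# Hypothetical shape of the `parsers` registry: bank directory name ->
# file extension -> parser class.
parsers: dict[str, dict[str, type]] = {
    'examplebank': {
        '.csv': ExampleBankCsvParser,
        '.pdf': ExampleBankPdfParser,
    },
}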
def parse(self, data):
    soup = BeautifulSoup(data, 'html.parser')
    obj = soup.find('span', {'class': "header-profile-login"})
    if not obj:
        raise Parser.IncorrectFormat(data)
    name = obj.text.strip()
    object_list = soup.find_all('a')
    if not object_list:
        raise Parser.IncorrectFormat(data)
    num_books = 0
    for obj in object_list:
        # Link text "Книги <N>" ("Books <N>") carries the book count.
        m = re.fullmatch(r"Книги\s*(\d+)\s*", obj.text)
        if m:
            num_books = int(m.group(1))
            break
    object_list = soup.find_all('div', {'class': "group-row-title"})
    if not object_list:
        raise Parser.IncorrectFormat(data)
    birth = None
    death = None
    for obj in object_list:
        if not birth:
            # "Родился/Родилась" = "born" (masculine/feminine).
            m = re.fullmatch(r"(?:Родился|Родилась):\s*(.*)", obj.text)
            if m:
                birth = m.group(1)
        if not death:
            # "Умер/Умерла" = "died" (masculine/feminine).
            m = re.fullmatch(r"(?:Умер|Умерла):\s*(.*)", obj.text)
            if m:
                death = m.group(1)
        if birth and death:
            break
    birth_date, birth_place = self.parseDate(birth)
    birth_place = re.sub(r'\s+', ' ', birth_place).strip()
    death_date, death_place = self.parseDate(death)
    death_place = re.sub(r'\s+', ' ', death_place).strip()
    # 'Почитатели творчества' = "admirers of the author's work".
    obj = soup.find('span', {
        'class': "stats-item marg-right",
        'title': 'Почитатели творчества'
    })
    adepts = int(obj.text if obj is not None else 0)
    # 'Читателей' = "readers".
    obj = soup.find('span', {
        'class': "stats-item marg-right",
        'title': 'Читателей'
    })
    readers = int(obj.text if obj is not None else 0)
    return [name, birth_date, birth_place, death_date, death_place,
            num_books, adepts, readers]
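`parseDate` is defined elsewhere in the class; judging by how its result is used, it splits a string like "1 января 1900 г., Москва" into a date part and a place part. A minimal sketch under that assumption:

def parseDate(self, text):
    # Hypothetical helper: split "date, place" on the first comma.
    # Returns empty strings when the field was absent on the page.
    if not text:
        return '', ''
    date, _, place = text.partition(',')
    return date.strip(), place.strip()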
class Scraper:
    def __init__(self):
        self.__load_config()
        self.parser = Parser()
        self.csv_maker = CsvMaker()
        self.page_number = 1

    def scrape(self):
        data = []
        for page_no in range(1, self.config[PAGES]):
            self.page_number = page_no
            data = data + self.scrape_page()
        self.csv_maker.make(data)

    def scrape_page(self):
        print("Scraping Page No: {}".format(self.page_number))
        resp = requests.get(self.__url_endpoint(), params=self.__query_dict())
        soup = BeautifulSoup(resp.text, 'html.parser')
        table = soup.find_all(True, {'class': ['row0', 'row1']})
        return self.parser.parse(table)

    def __load_config(self):
        with open('config.yaml') as f:
            self.config = yaml.safe_load(f)

    def __url_endpoint(self):
        return self.config[BASE_URL]

    def __query_dict(self):
        query_dict = {'pp': PER_PAGE_DATA, 'p': self.page_number}
        if COURSE in self.config:
            query_dict['q'] = self.config[COURSE]
        print(query_dict)
        return query_dict
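The scraper pulls everything it needs from config.yaml via the BASE_URL, PAGES, and COURSE key constants; a guess at the shape of the loaded dict, with all key names and values illustrative:

# Hypothetical result of yaml.safe_load for config.yaml.
config = {
    'base_url': 'https://example.com/search',  # BASE_URL
    'pages': 10,                               # PAGES: upper bound of page loop
    'course': 'CS101',                         # COURSE: optional `q` parameter
}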
def parse(city, country="France"):
    tmp = toParse.get(country)
    if not tmp:
        print(f"No such country as {country} is supported", file=sys.stderr)
        # Bail out early: tmp is None, so tmp.get(city) would raise.
        return None
    url = tmp.get(city)
    if not url:
        print(f"No such city as {city} is supported", file=sys.stderr)
        return None
    return Parser.parse(url, city, country)
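Both lookups assume a nested `toParse` mapping; a hedged sketch of its shape (country -> city -> URL, all entries illustrative):

# Hypothetical shape of the `toParse` registry used above.
toParse = {
    "France": {
        "Paris": "https://example.com/france/paris",
        "Lyon": "https://example.com/france/lyon",
    },
}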
def main():
    filename = process_arguments()
    with open(filename) as filebuffer:
        try:
            lex = Lexer(filebuffer)
            parser = Parser(lex)
            parser.P()
        except EndOfFileError:
            print("Syntax error at line " + str(lex.line))
        except CompilerSyntaxError as e:
            print(e)
        except CompilerLexError as e:
            print(e)
def test_parse(self):
    instance = None
    try:
        instance = Parser(files_folder="..\\wrong_path\\to\\files_folder")
    except SystemExit:
        pass
    finally:
        self.assertIsNone(instance)
def test_parsing_return_statement():
    source_code = """
    return a;
    """
    lexer = Lexer(source_code)
    program = Parser(lexer).exec_program()
    assert program.statements[0]._token_iteral == "Return with a"
def test_parsing_infix_expression():
    source_code = """
    1-2+3;
    """
    lexer = Lexer(source_code)
    program = Parser(lexer).exec_program()
    print("===========\n")
    print(program.statements[0]._token_iteral)
    print("===========\n")
def test_parsing_let_statement():
    source_code = """
    let a = 122;
    let b = 1;
    """
    lexer = Lexer(source_code)
    program = Parser(lexer).exec_program()
    for token in lexer.tokens:
        print(token)
    # for statement in program.statements:
    #     print(statement._token_iteral)
    print(program.statements[0].token_iteral)
    assert len(program._statements) == 2
    assert program.statements[0].token_iteral == \
        "This is a Let statement, left is an identifer: a, right size is value of 122"
def parse(self, data):
    """
    Parses html text and extracts field values
    :param data: html text (page)
    :return: a list of relative urls of author pages
    """
    soup = BeautifulSoup(data, 'html.parser')
    # extract href from
    # <a class="arow-name c-black" href="/author/30230">...</a>
    object_list = soup.find_all('a', {'class': 'arow-name c-black'})
    if not object_list:
        raise Parser.IncorrectFormat(data)
    return [x.get('href') for x in object_list]
class TestOrmMysql:
    @pytest.fixture(scope='function', autouse=True)
    def setup(self, orm_client):
        self.bd = orm_client
        self.builder = OrmBuilder(orm_client)
        self.parser = Parser()
        self.biggest_request, self.client_error, self.server_error = \
            self.parser.parse_logs(log_path='access.log', result='access.log',
                                   save_bd=True)

    def test_biggest_request_insert(self):
        for biggest_request in self.biggest_request:
            splitted = biggest_request[1].split()
            self.builder.add_biggest_request(splitted[0], splitted[2],
                                             biggest_request[0])
        res = self.bd.session.query(BiggestRequest).all()
        if len(self.biggest_request) > 10:
            assert len(res) == 10
        else:
            assert len(res) == len(self.biggest_request)

    def test_client_error_insert(self):
        for client_error in self.client_error:
            self.builder.add_client_error(
                client_error[0].split(sep=':')[0],
                int(client_error[0].split(sep=':')[1]),
                client_error[1])
        res = self.bd.session.query(ClientError).all()
        if len(self.client_error) > 10:
            assert len(res) == 10
        else:
            assert len(res) == len(self.client_error)

    def test_server_error_insert(self):
        for server_error in self.server_error:
            self.builder.add_server_error(
                server_error[0].split(sep=':')[0],
                int(server_error[0].split(sep=':')[1]),
                server_error[1])
        res = self.bd.session.query(ServerError).all()
        if len(self.server_error) > 10:
            assert len(res) == 10
        else:
            assert len(res) == len(self.server_error)
def parse_and_write_bank_statement(
        parser: Parser,
        src_file: Path,
        dest_file: Path,
        rules_dir: Optional[Path],
        import_transaction: ImportTransactionProtocol,
        force: bool,
        dry_run: bool) -> bool:
    if dest_file.exists():
        if force:
            print(f'WARNING: existing {dest_file} will be overwritten',
                  file=sys.stderr)
        else:
            print(f'WARNING: skipping import of already imported {src_file}',
                  file=sys.stderr)
            return False
    try:
        bank_statement = parser.parse(rules_dir=rules_dir)
    except NotImplementedError as e:
        print(f'Warning: couldn\'t parse {src_file}:', e.args, file=sys.stderr)
        return False
    if not dry_run:
        try:
            with open(dest_file, 'w') as f:
                bank_statement.write_ledger(f)
        except Exception:
            # Remove hledger file to allow clean import after fixing
            # whatever caused the Exception.
            try:
                dest_file.unlink()
            except FileNotFoundError:
                pass
            raise
    else:
        with io.StringIO() as f:
            bank_statement.write_ledger(f)
            print(f.getvalue())
    import_transaction.add_file(dest_file)
    src_ext = src_file.suffix
    moved_src = dest_file.with_suffix(src_ext)
    import_transaction.move_file_to_annex(src_file, moved_src)
    return True
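`ImportTransactionProtocol` is exercised through only two calls here; a guess at its interface, inferred from that usage rather than from its real definition:

from pathlib import Path
from typing import Protocol

# Interface inferred from the add_file/move_file_to_annex calls above;
# the actual protocol may declare more methods.
class ImportTransactionProtocol(Protocol):
    def add_file(self, path: Path) -> None: ...
    def move_file_to_annex(self, src: Path, dest: Path) -> None: ...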
def prepare_capability_matrix(self):
    self.capability_matrix = {
        TYPE_LOG_EVENT: dict(),
        TYPE_FS_CHANGE: dict(),
        TYPE_NETWORK_PACKET: dict(),
    }
    for parserx in Parser.__subclasses__():
        parser_instance = parserx()
        parser_instance.init()
        parser_capab = parser_instance.get_capabilities()
        type_matrix = self.capability_matrix[parser_capab['type']]
        for source in parser_capab['feeders_list']:
            parsers_for_source = type_matrix.setdefault(source, list())
            if parser_instance not in parsers_for_source:
                parsers_for_source.append(parser_instance)
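The discovery loop implies a contract for Parser subclasses: init() prepares the instance and get_capabilities() reports an event type plus the feeders it accepts. An illustrative subclass, with all names hypothetical:

class SyslogParser(Parser):
    def init(self):
        # e.g. compile regexes or load lookup tables (hypothetical body)
        pass

    def get_capabilities(self):
        return {
            'type': TYPE_LOG_EVENT,
            'feeders_list': ['syslog', 'journald'],
        }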
from generation.generators.frontend.user_generator import UserGenerator as UserModuleGenerator
from generation.generators.frontend.shopping_cart_generator import ShoppingCartGenerator as SCGenerator
from generation.generators.frontend.auth_generator import AuthGenerator
from generation.generators.frontend.home_generator import HomeGenerator
from generation.generators.frontend.starter_generator import StarterGenerator
from generation.generators.frontend.profile_generator import ProfileGenerator
from generation.generators.frontend.product_generator import ProductGenerator
from generation.generators.frontend.item_generator import ItemGenerator
from generation.generators.frontend.category_generator import CategoryGenerator as CategoryGeneratorFront

if __name__ == '__main__':
    try:
        shutil.rmtree('./output')
    except Exception:
        pass
    parser = Parser()
    model = parser.parse(os.path.join(root, "metamodel"), 'scala-angular.tx',
                         'project.scan', True)
    main_generator = MainGenerator()
    model_generator = ModelGenerator(main_generator)
    table_generator = TableGenerator(main_generator)
    repository_generator = RepositoryGenerator(main_generator)
    service_generator = ServiceGenerator(main_generator)
    controller_generator = ControllerGenerator(main_generator)
    dto_generator = DTOGenerator(main_generator)
    jwt_generator = JWTGenerator(main_generator)
    module_generator = ModuleGenerator(main_generator)
    conf_generator = ConfGenerator(main_generator)
    sbt_generator = SbtGenerator(main_generator)
    category_generator = CategoryGenerator(main_generator)
    order_generator = OrderGenerator(main_generator)
def page_items(self):
    return [Parser(e) for e in self.soup.select(Locators.ITEM)]
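`Locators.ITEM` is presumably a CSS selector constant; a minimal sketch, with the selector string purely illustrative:

# Hypothetical namespace for the CSS selectors used by the page object.
class Locators:
    ITEM = 'div.result-item'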
#!/usr/bin/env python
from parsers.parser import Parser
from controllers.base_controller import BaseController
from loggers.controller_logger import ControllerLogger
from loggers.request_logger import RequestLogger

# Parser
parser = Parser()
request = parser()

# TODO: Check if connection to socket has already been established.

# RequestLogger
RequestLogger(request.status, request.parsed_args.log_file)()

# Controller
base_controller = BaseController(request.parsed_args)
controller = base_controller()

# Logger
controller_logger = ControllerLogger(
    controller.subcontroller.__class__.__name__,
    controller.subcontroller.action,
    controller.subcontroller.status,
    controller.subcontroller.data,
    request.parsed_args.log_file)
controller_logger()
def parseAll():
    for country, cities in toParse.items():
        for city, url in cities.items():
            yield Parser.parse(url, city, country)
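Since parseAll is a generator, results arrive lazily, one per configured city; a small hypothetical driver:

# Consume the generator and print each parsed result as it is produced.
for row in parseAll():
    print(row)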
@classmethod
def setUpClass(cls):
    """
    Create one parser as a class variable, shared by every test in this class.
    """
    ParserTest.parser = Parser(files_folder="example_files")
def index_data(self):
    start = time()
    parser = Parser()
    self.get_all_files(self.root, parser)
    end = time()
    # Report elapsed indexing time in seconds.
    print(end - start)