from urllib.request import urlopen as uReq


class Scraper:
    def __init__(self):
        self.persistor = Persistor()

    def scrape(self):
        # fetch the TVs page from the zigzag website and store the raw HTML
        url = 'https://www.zigzag.am/am/tv-audio-video/tvs.html'
        # open the page and read the data
        data = uReq(url)
        page_html = data.read()
        data.close()
        # earlier in-place parsing experiment, kept for reference:
        # page_soup = BeautifulSoup(page_html, "html.parser")
        # categories = page_soup.findAll("div", {"class": "item_category"})
        # for category in categories:
        #     title = category.findAll("a", {"class": "item_name"})
        #     name = title[0].text
        #     print("name:", name)
        self.persistor.save_raw_data(page_html, "tvs.html")
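# A minimal sketch of the Persistor these snippets rely on, inferred from its
# call sites (save_raw_data, read_raw_data, append_data, save_csv). The real
# implementation is not shown, and the constructor and method signatures vary
# between the variants below, so the optional-argument handling here is an
# assumption, not the actual class.
import csv


class Persistor:
    def __init__(self, raw_file=None, csv_file=None):
        self.raw_file = raw_file
        self.csv_file = csv_file
        self.rows = []  # buffer used by append_data()/save_csv()

    def save_raw_data(self, data, filename=None):
        # write the downloaded page to disk (urlopen().read() returns bytes)
        with open(filename or self.raw_file, 'wb') as f:
            f.write(data if isinstance(data, bytes) else data.encode('utf-8'))

    def read_raw_data(self, filename=None):
        # read a previously scraped page back in as text
        with open(filename or self.raw_file, encoding='utf-8') as f:
            return f.read()

    def append_data(self, row):
        # accumulate one parsed row for a later save_csv() call
        self.rows.append(row)

    def save_csv(self, rows=None, filename=None):
        # write parsed rows (lists of fields) out as a CSV table
        with open(filename or self.csv_file, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(rows if rows is not None else self.rows)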
def parse():
    # parse the gathered data and save it as CSV
    logger.info("parse")
    storage = Persistor()
    parser = Parser()
    raw_data = storage.read_raw_data()
    parsed_files = parser.parse_object(raw_data)
    storage.save_csv(parsed_files)
def parse():
    logger.info("parse")
    storage = Persistor()
    parser = Parser()
    raw_data = storage.read_raw_data(SCRAPPED_FILE)
    data = parser.process_rawdata(raw_data)  # process the raw data into separate objects
    parsed_files = [parser.parse_object(file) for file in data]  # parse every object
    storage.save_csv(parsed_files, TABLE_FORMAT_FILE)  # save the parsed data
def parse():
    # parse the gathered data and save it as CSV
    logger.info("parse")
    storage = Persistor(SCRAPPED_FILE, TABLE_FORMAT_FILE)
    parser = Parser()
    raw_data = storage.read_raw_data()
    parsed_file = parser.parse_object(raw_data)
    storage.save_csv(parsed_file)
import datetime


def gather():
    logger.info("gather")
    storage = Persistor(SCRAPPED_FILE)
    scrapper = Scraper(storage)
    # scrape one page per year; note that range() stops before the current year
    for year in range(1903, datetime.datetime.now().year):
        scrapper.scrape(year)
import re


def parse():
    logger.info("parse")
    storage = Persistor(SCRAPPED_FILE)
    parser = Parser()
    raw_data = storage.read_raw_data()
    # cut the page down to the sortable wikitable
    ind_start = raw_data.find('table class="wikitable sortable"')
    raw_data = raw_data[ind_start:]
    ind_end = raw_data.find('</table>')
    raw_data = raw_data[:ind_end + len('</table>')]
    # grab every <tr>...</tr> row; re.DOTALL lets .*? match across newlines
    all_rows = re.findall('<tr.*?</tr>', raw_data, re.DOTALL)
    parsed_files = [parser.parse_object(raw) for raw in all_rows]
    storage.save_csv(parsed_files, TABLE_FORMAT_FILE)
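# A hedged sketch of what Parser.parse_object could look like for the
# <tr>...</tr> strings produced above. The real Parser is not shown, so the
# regex-based cell extraction below is an assumption about its behavior.
import re


class Parser:
    def parse_object(self, raw_row):
        # pull every table cell (<td> or <th>) out of one table row
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', raw_row, re.DOTALL)
        # strip nested tags and surrounding whitespace from each cell
        return [re.sub(r'<[^>]+>', '', cell).strip() for cell in cells]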
import datetime


def parse():
    # parse the gathered data year by year and save it as CSV
    logger.info("parse")
    storage = Persistor(SCRAPPED_FILE)
    parser = Parser()
    # note: range() stops before the current year
    for year in range(1903, datetime.datetime.now().year):
        raw_data = storage.read_raw_data(year)
        parsed_file = parser.parse_object(raw_data)
        storage.append_data(parsed_file)
    storage.save_csv(TABLE_FORMAT_FILE)
def gather():
    logger.info("gather")
    storage = Persistor()
    scrapper = Scraper(storage)
    scrapper.scrape(SCRAPPED_FILE)
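# A minimal sketch of the module-level glue the snippets above assume: logger,
# SCRAPPED_FILE and TABLE_FORMAT_FILE are referenced but never defined in these
# snippets, so the names' values below are assumptions for illustration.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

SCRAPPED_FILE = "scrapped.html"    # assumed name of the raw-HTML dump
TABLE_FORMAT_FILE = "table.csv"    # assumed name of the parsed CSV output

if __name__ == '__main__':
    gather()  # download and persist the raw page(s)
    parse()   # turn the stored HTML into a CSV table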