import copy
import os

import crawl


class Paths(object):
    search_path = crawl.Crawl(os.path.realpath(os.path.relpath('..', __file__)))

    @property
    def root(self):
        return copy.deepcopy(self.search_path.root)

    @property
    def paths(self):
        return copy.deepcopy(self.search_path.paths)

    def prepend_path(self, *paths):
        self.prepend_paths(*paths)

    def prepend_paths(self, *paths):
        self.search_path.prepend_paths(*paths)

    def append_path(self, *paths):
        self.append_paths(*paths)

    def append_paths(self, *paths):
        self.search_path.append_paths(*paths)

    def clear_paths(self):
        # Iterate over a copy so removal does not mutate the list mid-loop.
        for path in copy.deepcopy(self.search_path.paths):
            self.search_path.remove_path(path)
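
# A minimal usage sketch of the Paths wrapper above. It assumes crawl.Crawl
# exposes root, paths, prepend_paths, append_paths, and remove_path with the
# semantics the wrapper relies on; the example paths are placeholders.
paths = Paths()
paths.append_path('/opt/plugins')         # added to the end of the search order
paths.prepend_path('/home/user/plugins')  # added to the front of the search order
print(paths.root)    # deep copies, so callers cannot mutate crawler state
print(paths.paths)
paths.clear_paths()  # removes every registered search path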
def gen_captcha(self, batch_size=50):
    # image = ImageCaptcha(width=self.width, height=self.height, font=self.font, font_sizes=56)
    craw = crawl.Crawl()
    while True:
        # Fresh arrays each batch so one-hot labels from the previous
        # batch cannot leak into the next one.
        X = np.zeros([batch_size, self.height, self.width, 1])
        Y = np.zeros([batch_size, self.char_num, self.classes])
        for i in range(batch_size):
            # captcha_str = ''.join(random.sample(self.characters, self.char_num))
            # img = image.generate_image(captcha_str).convert('L')
            craw.Start()
            captcha_str = craw.CaptchaString
            img = craw.CaptchaImage
            img = np.array(img.getdata())
            X[i] = np.reshape(img, [self.height, self.width, 1]) / 255.0
            for j, ch in enumerate(captcha_str):
                Y[i, j, self.characters.find(ch)] = 1
        # Yield a flattened copy instead of rebinding Y: reshaping Y in place
        # would leave a 2-D array behind and break the indexing on the next pass.
        yield X, np.reshape(Y, (batch_size, self.char_num * self.classes))
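
# A sketch of how gen_captcha would be consumed in a training loop.
# 'CaptchaGen' is a hypothetical stand-in for whatever class actually
# defines gen_captcha; only the method itself appears in the source.
gen = CaptchaGen()
batches = gen.gen_captcha(batch_size=32)
X_batch, Y_batch = next(batches)  # X_batch: (32, height, width, 1), scaled to [0, 1]
                                  # Y_batch: (32, char_num * classes), one-hot rows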
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))

import crawl

crawler = crawl.Crawl('http://www.ttt.uz')

dictionary = {
    'year': ['Год выпуска', 'Год', 'Chiqarilgan Yili'],
    'country': ['Страна', 'Davlat'],
    'genre': ['Жанр', 'Janr'],
    'time': ['Время', 'Davomiyligi'],
    'director': ['Режиссер', 'Rejisyor'],
    'actors': ['В главных ролях', 'Bosh ro’lda'],
    'version': ['Версия'],
    'dev': ['Разработчик'],
    'lang': ['Язык'],
    'tbd': ['Перевод', 'Tarjima']
}

crawler.require = './/div[@class="entry-content"]'


def scrape(t):
    t.br_replacer = ' ‧ '
    t.dropFromMain('.//script')
    t.set_main('description')
    t.get_data('title', './/h1[@class="entry-title"]')
    t.get_atr('img', './/img[@class="aligncenter wp-post-image"]/@src')
import crawl

c = crawl.Crawl()
c.getListPage()
import os
import datetime

import crawl
import write_to_csv
import to_json
import clean_csv
import logger
import parse

if __name__ == '__main__':
    project_name = datetime.datetime.now().strftime("%d_%m_%Y")

    logger.write('Crawling start')
    web_crawler = crawl.Crawl()
    web_crawler.crawl(project_name)
    logger.write('Crawling end')

    csv_path = 'data.csv'
    to_csv = write_to_csv.ToCSV(csv_path)

    rootDir = 'thedailystar/' + project_name
    i = 0
    for current_folder, sub_folders, fileList in os.walk(rootDir):
        # Skip folders that contain Bangla news
        if 'bangla' in current_folder:
            # print(current_folder)
            continue
        for file in fileList:
            if file.endswith('.html') and '__rss' not in file:
                full_path = os.path.join(current_folder, file)
                normalised_html_file_path = os.path.normpath(full_path)
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))

import crawl

crawler = crawl.Crawl('http://megasoft.uz', 'windows-1251')
crawler.require = './/a[starts-with(@href, "/get/")]'


def condition(t, xpath, xpath2):
    t.mainEl = t.lxml.xpath(xpath)
    return len(t.mainEl) != 0


crawler.condition = condition

stringSel = {
    "os": "Система:",
    "size": "Размер файла: ",
    "lang": "Язык интерфейса: "
}
other = {
    "publishDate": "Добавлено:",
    "downloadCount": "Количество загрузок: "
}

xpathstr = {}
xpath = {}
template = './/table[@width="300"]//td[text()="{0}"]/following-sibling::td'
for x in stringSel:
    xpathstr[x] = template.format(stringSel[x])
for x in other:
    xpath[x] = template.format(other[x])
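
# Illustrative only: what the template above expands to for one label.
template = './/table[@width="300"]//td[text()="{0}"]/following-sibling::td'
print(template.format('Система:'))
# .//table[@width="300"]//td[text()="Система:"]/following-sibling::td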
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))

import crawl

crawler = crawl.Crawl('http://mytube.uz')
crawler.limited = False
crawler.require = './/div[@class="WhiteBlock CommentsBlock"]'


def scrape(t):
    t.get_data('title', './/h2')
    t.get_data('description', './/div[@id="aboutUser"]/pre')
    t.get_data('category', './/span[@class="userinfobox-categories-tags-container"]/a[1]')
    t.get_data_array(
        'tags',
        './/span[@class="userinfobox-categories-tags-container"]/a[not(position()=1)]'
    )
    t.get_data_int('views', './/div[@class="Views-Container"]')
    t.get_data_date('publishDate', './/div[@class="Date"]/text()[last()]')


crawler.scrape = scrape
crawler.urlNotContains.extend(('/uz/', '/oz/'))
crawler.crawl()
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))

import crawl

crawler = crawl.Crawl('http://tas-ix.me/')
crawler.require = './/table[@id="topic_main"]//div[@class="post_wrap"]'
crawler.require2 = './/fieldset[@class="attach"]'


def condition(self, xpath1, xpath2):
    self.mainEl = self.lxml.find(xpath1)
    if self.mainEl is not None:
        return self.mainEl.find(xpath2) is not None
    return False


crawler.condition = condition


def scrape(t):
    t.dropFromMain('.//fieldset[@class="attach"]')
    t.dropFromMain('//div[@class="sp-body" and @title="MediaInfo"]')
    t.dropFromMain('//script')
    t.set_main('description')
    t.get_data('title', './/h1[@class="maintitle"]')
    t.get_data_array('category', '(.//td[@class="nav w100"])[1]/a[not(position()=1)]')
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))

import crawl

crawler = crawl.Crawl('http://topmusic.uz', 'windows-1251')


def condition(t, req1, req2):
    if '/album-' in t.url:
        crawler.scrape = albumScrape
        return True
    crawler.scrape = artistScrape
    if t.lxml.find('.//div[@id="clips_section"]') is not None:
        return True
    if t.lxml.find('.//div[@id="singls_section"]') is not None:
        return True
    return False


def artistScrape(t):
    t.lxml = t.lxml.find('.//div[@class="box-mid"]')
    t.get_data('title', './/h2[1]')
    t.get_data('genre', './div[1]//a[1]')
    # The trailing 20 strips the first 20 characters of each title.
    t.get_atr_array('clip', './/div[@class="clip-box"]/a[3]/@title', 20)
    t.get_atr_array('single', './/a[@class="play-track"]/@title')
def __init__(self):
    self.url_table = url_table.UrlTable()
    self.crawl = crawl.Crawl()
    self.webpage_parse = webpage_parse.WebPageParse()
    self.webpage_save = webpage_save.WebPageSave()
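
# A hedged sketch of how these four collaborators could form a fetch-parse-save
# loop. Every method on them below (add, empty, pop, fetch, parse, save) is an
# assumption; only the constructors appear in the source.
def run(self, seed_url):
    self.url_table.add(seed_url)                      # seed the frontier
    while not self.url_table.empty():
        url = self.url_table.pop()                    # next URL to visit
        page = self.crawl.fetch(url)                  # assumed fetch API
        for link in self.webpage_parse.parse(page):   # assumed to yield out-links
            self.url_table.add(link)
        self.webpage_save.save(url, page)             # assumed persistence API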
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))

import crawl

crawler = crawl.Crawl('https://mover.uz')
crawler.limited = False
crawler.require = './/h1[@class="fl video-title"]'


def scrape(t):
    t.set_main('title')
    t.get_data('description', './/div[@class="desc-text"]')
    # Drop the last 21 characters of the description.
    t.data['description'] = t.data['description'][:-21]
    # print t.data['description']
    t.get_data('category', './/p[@class="cat-date"]/a')
    t.get_data_array('tags', './/p[@class="tags"]/a')
    t.get_data_int('views', './/span[@class="fr views"]/strong')
    t.get_data_int('likes', './/table[@class="r-desc"]/tr/td[@class="like"]')
    t.get_data_int('dislikes', './/table[@class="r-desc"]/tr/td[@class="dislike"]')
    t.get_data_date('publishDate', './/p[@class="cat-date"]/text()[1]')


crawler.scrape = scrape
# crawler.urlNotContains.extend()
crawler.crawl()
import sys
sys.path.append(sys.path[0].replace('/list', ''))
sys.path.append(sys.path[0].replace('\\list', ''))

import crawl

crawler = crawl.Crawl('http://mediabox.uz/ru')
crawler.require = './/p[@class="col-lg-12 inner_title"]'

selector = {
    "release": "Год:",
    "country": "Страна:",
    "genre": "Жанр:",
    "subtitle": "Слоган:",
    "budget": "Бюджет:",
    "producer": "Продюсер:",
    "director": "Режиссёр:",
    "actor": "Актеры:",
    "lang": "Язык:",
    "time": "Время:"
}

xpath = {}
for x in selector:
    xpath[x] = './/div[@id="info"]//td[text()="' + selector[x] + '"]/following-sibling::td'


def scrape(t):
    t.set_main('title')
    t.get_atr('img', './/div[@class="cover"]/img/@src')
    for x in xpath:
        t.get_data(x, xpath[x])
    t.get_data('forAge', './/div[@id="info"]/td[contains(., "Возраст:")]/following-sibling::td/b')