        empleo_list = response.xpath("//div[@class='empleo-item']")
        empleo_list = [
            empleo for empleo in empleo_list
            if empleo.xpath("span[@class='status status--open']/text()").get() is not None
        ]
        for empleo_item in empleo_list:
            oferta = PulinkItem()
            oferta['start_date'] = empleo_item.xpath("time/text()").get()
            oferta['entidad'] = self.entidad
            oferta['ciudad'] = self.ciudad
            oferta['titulo'] = empleo_item.xpath("h2/a/text()").get().replace("\n", '').replace("\r", '')
            oferta['referencia'] = empleo_item.xpath("dl/dd[1]/text()").get()
            oferta['url'] = self.url_site + empleo_item.xpath("h2/a/@href").get()
            oferta['deadline'] = empleo_item.xpath("dl/dd[3]/text()").get()
            yield oferta


if __name__ == "__main__":
    process = CrawlerProcess({
        'FEED_URI': 'data/la_fe.csv',
        'FEED_FORMAT': 'csv'
    })
    process.crawl(spider_la_fe)
    process.start()
        yield scrapy.Request(next_page, self.parse_reviews_page)

    def parse_page(self, response):
        review_links = []
        for pr in response.css('a.g-rating-reviews-link'):
            rating = pr.css("span.g-rating-stars-i")
            if rating:
                review_links.append(pr)
        for link in review_links:
            yield response.follow(link, self.parse_reviews)

    def parse_content(self, response):
        for comment in self.parse_page(response):
            yield comment
        for next_page in response.css('a.paginator-catalog-l-link::attr(href)').extract():
            yield scrapy.Request(next_page, self.parse_page)

    def parse(self, response):
        for next_page in response.css('a.sprite-side.pab-items-i-link'):
            yield response.follow(next_page, self.parse_content)


process = CrawlerProcess({
    'FEED_FORMAT': 'jl',
    'FEED_URI': 'data.json',
    'FEED_EXPORT_ENCODING': 'utf-8'
})
process.crawl(RozetkaSpider)
process.start()
#             'catallaxy',
#             '-o',
#             'Export\\test1.csv',
#             '-a',
#             'domain=infowars.com,'
#         ]
#     )
# except SystemExit:
#     pass

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from urllib.parse import urlparse
import re

process = CrawlerProcess(get_project_settings())

# process.crawl('AnyDomain',
#               DOMAIN='clubtroppo.com.au',
#               MATCH="https:\\/\\/clubtroppo\\.com\\.au\\/\\d+\\/\\d+\\/\\d+\\/[\\w-]+\\/",
#               CRAWL_DIFFBOT=False,
#               FILE_NAME="Test_1")
# process.crawl('AnyDomain',
#               DOMAIN='clubtroppo.com.au',
#               MATCH="^https:\\/\\/clubtroppo\\.com\\.au\\/\\d+\\/\\d+\\/\\d+\\/[\\w-]+\\/$",
#               CRAWL_DIFFBOT=False,
#               FILE_NAME="club")
# process.crawl('AnyDomain',
#               DOMAIN='andrewelder.blogspot.com',
#               MATCH="http:\/\/andrewelder\.blogspot\.com\/\d+\/\d+\/[\w-]+\.html",
            else:
                try:
                    myfile.write(article_id + ';' + r.get('news').get('date') + ';' +
                                 r.get('content') + ';' + r.get('create_date') + '\n')
                    self.history[article_id] = r.get('id')
                except Exception:
                    pass
                self.req_error = False
        except Exception:
            self.req_error = True
            return

        with open('history.csv', 'w') as history_file:
            for key, value in self.history.items():
                history_file.write(key + ';' + value + ';\n')


process = CrawlerProcess({
    'DOWNLOAD_DELAY': delay,
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
        'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': None,
    },
    # 'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 302],
    'HTTPERROR_ALLOW_ALL': True,
})
process.crawl(CommentSpider)
process.start()
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from jobparser import settings
from jobparser.spiders.hhru import HhruSpider
from jobparser.spiders.superjob import SuperjobSpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)
    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(HhruSpider)
    process.crawl(SuperjobSpider)
    process.start()
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from s3suspects.spiders import suspects

if __name__ == '__main__':
    settings = get_project_settings()
    crawler = CrawlerProcess(settings)
    crawler.crawl('suspects')
    crawler.start()
retmax = 100
year = None

first_arg = sys.argv[1]
second_arg = sys.argv[2]
search_name = first_arg  # Name of the author to investigate
to_id = second_arg

# Global dataframe which contains all data retrieved on parent and child publications
gdf = pd.DataFrame(columns=['date', 'id', 'name', 'layer', 'parent', 'fb'])

# Create a spider to perform the web crawling process
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

url = ""
if year is None:
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term="
           + format_search(search_name) + "[author]&retmax=" + str(retmax) + "&retmode=json")
else:
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term="
           + format_search(search_name) + "[author]" + year + "[pubdate]&retmax="
           + str(retmax) + "&retmode=json")
print(format_search(search_name))

ChamberSpider.start_urls = [url]
process.crawl(ChamberSpider)
x = process.start()

# Output the contents of the dataframe into a csv file
file_name = to_id  # str(search_name) + "_data_ms" + str(datetime.datetime.now().microsecond)
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from lesson5.gbparse import settings
from lesson5.gbparse.spiders.instagram import InstagramSpider

if __name__ == '__main__':
    scr_settings = Settings()
    scr_settings.setmodule(settings)
    process = CrawlerProcess(settings=scr_settings)
    # process.crawl(GeekbrainsSpider)
    process.crawl(InstagramSpider)
    process.start()

# Resource: Instagram
# Task:
# Log in first, then walk through an arbitrary list of users.
# Extract: the list of followers and the list of accounts each target user follows.
# Store the data in MongoDB so that the follow relations of any given user
# can be retrieved conveniently and quickly.
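# The task comments above stop short of the storage step. Below is a minimal,
# hypothetical sketch (not the project's actual pipeline) of one way to store
# the follow relations in MongoDB so either direction can be fetched with a
# single indexed lookup; the `insta` database, `follows` collection and
# `save_follows` helper are assumed names, not taken from the source.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)  # assumes a local MongoDB instance
follows = client['insta']['follows']      # hypothetical database/collection
follows.create_index('username', unique=True)


def save_follows(username, followers, following):
    # Upsert one document per user holding both relation lists.
    follows.update_one(
        {'username': username},
        {'$set': {'followers': followers, 'following': following}},
        upsert=True,
    )

# Usage: save_follows('target_user', ['alice', 'bob'], ['carol'])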
def runSpider(spiderName, userAgent):
    process = CrawlerProcess({'USER_AGENT': userAgent})
    for i in spiderName:
        process.crawl(i)
    process.start()
    def __init__(self):
        self.process = CrawlerProcess()
        print(getattr(Mrjatt, 'link_type')[getattr(Mrjatt, 'x')])
        if getattr(Mrjatt, 'link_type')[getattr(Mrjatt, 'x')] == 'Movies':
            for i in msc_nm:
                yield i
            x = int(input('Number of responses are ' + str(len(link) + 1) + ': '))
            yield response.follow('https://www.songs-mp3.net' + link[x - 1],
                                  callback=self.endgame)
        else:
            for i in range(len(msc_nm)):
                if msc_nm[i] + '.mp3' == getattr(Mrjatt, 'naam')[getattr(Mrjatt, 'x')]:
                    yield response.follow('https://www.songs-mp3.net' + link[i],
                                          callback=self.endgame)

    def endgame(self, response):
        song_link = response.xpath(
            '/html/body/div[2]/div[2]/div/div[3]/div[2]/div[2]/div[2]/a/@href'
        ).extract_first()
        yield {'this_is_the_download_link': song_link}


process = CrawlerProcess()
process.crawl(Mrjatt)
process.start()
class MySpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')


process = CrawlerProcess(settings={
    "FEEDS": {
        "items.json": {"format": "json"},
    },
})
process.crawl(MySpider)
process.start()
def crawl(spider, setting):
    # Merge per-run overrides over the project settings; keys in `setting`
    # win on collisions because they are unpacked last.
    process = CrawlerProcess({**get_project_settings(), **setting})
    process.crawl(spider)
    process.start()
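# Hypothetical usage of the helper above (MySpider is an assumed spider
# class, not defined in the source):
#
#     crawl(MySpider, {'DOWNLOAD_DELAY': 2, 'LOG_LEVEL': 'WARNING'})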
    def fetch_previous_article(self, html):
        soup = BeautifulSoup(html, features='lxml')
        for a in soup.find_all('a', {'class': 'blog-pager-older-link'}):
            yield Request(a['href'], self.parse)

    def save_article(self, url, html):
        filename = url.split('/')[-1]
        filepath = os.path.join(self.dst_folder, filename)
        LOGGER.info(f'Saving article to {filepath}')
        with open(filepath, 'wb') as f:
            f.write(html)


if __name__ == '__main__':
    # File `urls.yaml` is intentionally not committed in order to hide the
    # identity of the scraped blogs
    with open('urls.yaml') as f:
        urls = yaml.safe_load(f.read())
    start_article_url = urls['blogger']

    crawler = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'DOWNLOAD_DELAY': 3
    })
    crawler.crawl(BloggerSpider, start_article_url, dst_folder='data/html/blogger/')
    crawler.start()
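# The `urls.yaml` file above is deliberately untracked; from the
# `urls['blogger']` lookup, it only needs a `blogger` key mapping to a start
# URL. A hypothetical example with a placeholder address (the real blogs are
# intentionally hidden):
#
#     blogger: https://example.blogspot.com/2020/01/first-post.html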
        Rule(
            LinkExtractor(
                # allow=r'https://www.paginasamarillas.com.ar/buscar/q/funerarias/p-\d+/\?tieneCobertura=true'
                allow=r'funerarias/p-\d+'
            ),
            follow=True,
            callback="parseador"
        ),
    )

    def parseador(self, response):
        print('url:', response.url)
        sel = Selector(response)
        funerarias = sel.xpath('//div[contains(@class, "figBox")]')
        # Shared processor that strips newlines, carriage returns and tabs
        clean = MapCompose(lambda i: i.replace('\n', '').replace('\r', '').replace('\t', '').strip())
        for funeraria in funerarias:
            item = ItemLoader(Articulo(), funeraria)
            item.add_xpath('nombre', './/span[@class="semibold"]/text()', clean)
            item.add_xpath('direccion', './/span[@class="directionFig"]/text()', clean)
            item.add_xpath('telefono', './/span[@itemprop="telephone"]/text()', clean)
            item.add_xpath('comunaregion', './/span[@class="city"]/text()', clean)
            yield item.load_item()

# ---------------------------------------------------------------------------
process = CrawlerProcess({
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'output.csv'
})
process.crawl(SeccionAmarillaCrawler)
process.start()
def runSingleSpider(spider, UA):
    process = CrawlerProcess({'USER_AGENT': UA})
    process.crawl(spider)
    process.start()
from string import Template

from flask import Flask
from scrapy.crawler import CrawlerProcess

from spiders.ouc_modules import OucModulesSpider

import os
import json

app = Flask(__name__)

module_json = "modules.json"
module_yml = "data/cos/modules.yml"

if os.path.exists(module_json):
    os.remove(module_json)

crawler = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'FEED_FORMAT': 'json',
    'FEED_URI': module_json
})
crawler.crawl(OucModulesSpider)
crawler.start()

if os.path.exists(module_yml):
    os.remove(module_yml)

with open(module_yml, "w") as f:
    f.write('categories:' + '\n')
    f.write('- modules' + '\n')
    f.write('conversations:' + '\n')
    with open(module_json) as mj:
        for item in json.load(mj):
# $ scrapy crawl myspider -a word="abba"

import scrapy


class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['http://quotes.toscrape.com']

    def __init__(self, word=None, *args, **kwargs):  # <--- receive parameter
        super().__init__(*args, **kwargs)
        self.word = word  # <--- store parameter

    def parse(self, response):
        print('url:', response.url)
        print('word:', self.word)  # <--- use parameter


# --- it runs without a project ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(MySpider, word='tags')  # <--- send parameter
c.start()
    def handle(self, *args, **options):
        process = CrawlerProcess(get_project_settings())
        process.crawl(NintendoSpider)
        process.crawl(PS4Spider)
        process.start()
        elemDes = response.css(
            'div.product-single__description.rte').extract_first()
        des = '\n'.join(
            Selector(text=elemDes).xpath('//p/text()').extract())
        yield {
            'name': response.css('h1.product-single__title::text').extract_first(),
            'price': price,
            'image_urls': response.xpath('//a[contains(@href, "products")]/img/@src').extract(),
            'description': des
        }


########################################################################
# Output
########################################################################

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'FEED_FORMAT': 'json',
    'FEED_URI': 'result.json'
})
process.crawl(ProductosSpider)
process.start()
exit(0)
    def handle(self, *args, **options):
        print('Spinning up the crawler......')
        process = CrawlerProcess()
        process.crawl(engines.Citizen)
        process.start()
def spider_task():
    process = CrawlerProcess(get_project_settings())
    process.crawl(HotmovieSpider)
    process.start()
# settings['USER_AGENT'] = None
settings['ROBOTSTXT_OBEY'] = False
settings['DOWNLOAD_DELAY'] = 1
settings['ITEM_PIPELINES'] = {
    'redfin.pipelines.SQLiteStoreItemPipeline': 300,
}
settings['AUTOTHROTTLE_ENABLED'] = True
settings['AUTOTHROTTLE_START_DELAY'] = 5
settings['AUTOTHROTTLE_MAX_DELAY'] = 60
settings['AUTOTHROTTLE_TARGET_CONCURRENCY'] = 1.0
settings['DOWNLOADER_MIDDLEWARES'] = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'redfin.rotate_useragent.RotateUserAgentMiddleware': 400,
}

# if crnt_date in SunDays:
if True:
    print('Run Redfin SF Home')
    process = CrawlerProcess(settings)
    process.crawl("redfin_sf_home")
    process.start()
from scrapy.crawler import CrawlerProcess
import scrapy
import re
import datetime
import sys


class ParktakesSpider_level(scrapy.Spider):
    name = 'parktakes_levq'

    def __init__(self, inputq="", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.inputw = inputq
        # self.records_count = 0

    def start_requests(self):
        url = self.inputw
        yield scrapy.Request(url, dont_filter=True)

    def parse(self, response):
        NAME_SELECTOR = 'table tr'
        for each_td in response.css(NAME_SELECTOR):
            cell_text = each_td.css('tr td p::text').extract_first()
            if cell_text:
                if "Records" in str(cell_text.encode('ascii', 'ignore')):
                    records_count.append(
                        each_td.css('tr td p b::text').extract()[2].encode('ascii', 'ignore'))
                    break


search_url = sys.argv[1]
records_count = []
SETTINGS = {'LOG_ENABLED': False}
process = CrawlerProcess(SETTINGS)
process.crawl(ParktakesSpider_level, inputq=search_url)
process.start()
print(int(records_count[0]))
"parentUrl": url })) except Exception as e: print(e) ImageDBTransaction.commit() URLDBTransaction = FUTURE.beginTransaction(writePermission=True) FUTURE.addElementToIndex( encodeURLAsNumber(url, 1), bson.dumps({ "vec": webPageSummaryVector.tostring(), "language": webPageVector[2], "body": webPageVector[1], "header": webPageVector[3], "url": url }), URLDBTransaction) URLDBTransaction.commit() for href in response.css("a::attr(href)"): yield response.follow(href, self.parse) if __name__ == "__main__": FUTURE = Monad("future_urls") images = lmdb.open("future_images", map_size=int(1e12), writemap=True) process: Callable = CrawlerProcess({ "USER_AGENT": "FUTURE by Roberto Treviño Cervantes. I'am building a safer, faster and more precise Search Engine, if you do not want to be part of the index, report me to [email protected]" }) process.crawl(Indexer) process.start()
    def __init__(self):
        self.crawler = CrawlerProcess(get_project_settings())
        international_faculty_number = float(
            international_faculty_number.replace(',', ''))
        if student_faculty_ratio is not None:
            student_faculty_ratio = float(
                student_faculty_ratio.replace(',', ''))
        yield {
            # 'Status': response.css("div.uni_info").css("li[title*='Status']").css('span.info-setails::text').get(),
            # 'Research Output': response.css("div.uni_info").css("li[title*='Research Output']").css('span.info-setails::text').get(),
            # 'Scholarships': response.css("div.uni_info").css("li[title*='Scholarships']").css('span.info-setails::text').get(),
            # 'Size': response.css("div.uni_info").css("li[title*='Size']").css('span.info-setails::text').get(),
            'uni': name,
            'total_students_number': total_students_number,
            'international_students_number': international_students_number,
            'total_faculty_number': total_faculty_number,
            'international_faculty_number': international_faculty_number,
            'student_faculty_ratio': student_faculty_ratio
        }


process = CrawlerProcess(settings={
    "FEEDS": {
        "raw_values.json": {"format": "json", "overwrite": True},
    },
})
process.crawl(qsRanking)
process.start()
            '//a[contains(@class, "post-tags")]/text()').extract()
        for para in response.xpath(
                '//div[contains(@class, "article-single-content")]/p'):
            i += 1
            yield {
                'Title': name[0],
                'Paragraph': str(i),
                'Text': para.xpath('text()').extract()[0],
                'Tags': tags,
                'URL': response.url
            }


# Tell the script how to run the crawler by passing in settings.
# The new settings have to do with scraping etiquette.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',        # Store data in JSON format.
    'FEED_URI': 'phdessay.json',  # Name our storage file.
    'LOG_ENABLED': False,         # Turn off logging for now.
    'ROBOTSTXT_OBEY': True,
    'USER_AGENT': 'tjeffkessler ([email protected])',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'DEBUG': True
})

# Start the crawler with our spider.
process.crawl(EssaySpider)
process.start()
print('Finished scraping at {}'.format(datetime.now()))
        # review date
        review_date = response.css('span.ratingDate::text').extract_first()
        # direct to review text
        review_text = response.css('p.partial_entry::text')
        # extract and clean review text
        review_text = review_text.extract_first()
        # store these fields in the results list
        reviews_list.append([response.url, review_title, review_date, review_text])


# Initialize the list
reviews_list = []

import csv

# Run the Spider
process = CrawlerProcess()
process.crawl(TASpider)
process.start()

# reviews_list = list(set(reviews_list))
print(len(reviews_list))
print(reviews_list[-4:])

with open('reviewsData.csv', 'w', newline='') as f:
    # configure writer to write a standard csv file
    writer = csv.writer(f, delimiter=',')
    writer.writerow(['Site', 'Review_title', 'Review_date', 'Review_paragraph'])
    for item in reviews_list:
        # Write item to f
        writer.writerow([item[0], item[1], item[2], item[3]])
def main():
    # target_board = ['Gossiping', 'Stock', 'NBA']
    process = CrawlerProcess(get_project_settings())
    for board in target_board:
        process.crawl('PTTCrawler', board=board)
    process.start()