Example #1
        empleo_list = response.xpath("//div[@class='empleo-item']")
        empleo_list = [
            empleo for empleo in empleo_list if empleo.xpath(
                "span[@class='status status--open']/text()").get() is not None
        ]

        for empleo_item in empleo_list:

            oferta = PulinkItem()

            oferta['start_date'] = empleo_item.xpath("time/text()").get()
            oferta['entidad'] = self.entidad
            oferta['ciudad'] = self.ciudad
            oferta['titulo'] = empleo_item.xpath("h2/a/text()").get().replace(
                "\n", '').replace("\r", '')
            oferta['referencia'] = empleo_item.xpath("dl/dd[1]/text()").get()
            oferta['url'] = self.url_site + empleo_item.xpath(
                "h2/a/@href").get()
            oferta['deadline'] = empleo_item.xpath("dl/dd[3]/text()").get()

            yield oferta


if __name__ == "__main__":
    process = CrawlerProcess({
        'FEED_URI': 'data/la_fe.csv',
        'FEED_FORMAT': 'csv'
    })
    process.crawl(spider_la_fe)
    process.start()
Example #2
            yield scrapy.Request(next_page, self.parse_reviews_page)

    def parse_page(self, response):
        review_links = []
        for pr in response.css('a.g-rating-reviews-link'):
            rating = pr.css("span.g-rating-stars-i")
            if rating:
                review_links.append(pr)

        for link in review_links:
            yield response.follow(link, self.parse_reviews)

    def parse_content(self, response):
        for comment in self.parse_page(response):
            yield comment
        for next_page in response.css(
                'a.paginator-catalog-l-link::attr(href)').extract():
            yield scrapy.Request(next_page, self.parse_page)

    def parse(self, response):
        for next_page in response.css('a.sprite-side.pab-items-i-link'):
            yield response.follow(next_page, self.parse_content)


process = CrawlerProcess({
    'FEED_FORMAT': 'jl',
    'FEED_URI': 'data.json',
    'FEED_EXPORT_ENCODING': 'utf-8'
})
process.crawl(RozetkaSpider)
process.start()
Example #3
#             'catallaxy',
#             '-o',
#             'Export\\test1.csv',
#             '-a',
#             'domain=infowars.com,'
#         ]
#     )
# except SystemExit:
#     pass

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from urllib.parse import urlparse
import re

process = CrawlerProcess(get_project_settings())

# process.crawl('AnyDomain',
#     DOMAIN='clubtroppo.com.au',
#     MATCH="https:\\/\\/clubtroppo\\.com\\.au\\/\\d+\\/\\d+\\/\\d+\\/[\\w-]+\\/",
#     CRAWL_DIFFBOT=False,
#     FILE_NAME="Test_1")
# process.crawl('AnyDomain',
#     DOMAIN='clubtroppo.com.au',
#     MATCH="^https:\\/\\/clubtroppo\\.com\\.au\\/\\d+\\/\\d+\\/\\d+\\/[\\w-]+\\/$",
#     CRAWL_DIFFBOT=False,
#     FILE_NAME="club")

# process.crawl('AnyDomain',
#     DOMAIN='andrewelder.blogspot.com',
#     MATCH="http:\/\/andrewelder\.blogspot\.com\/\d+\/\d+\/[\w-]+\.html",
Example #4
                    else:
                        try:
                            myfile.write(article_id + ';' +
                                         r.get('news').get('date') + ';' +
                                         r.get('content') + ';' +
                                         r.get('create_date') + '\n')
                            self.history[article_id] = r.get('id')
                        except Exception:
                            pass
            self.req_error = False
        except Exception:
            self.req_error = True
            return

        with open('history.csv', 'w') as history_file:
            for key, value in self.history.items():
                history_file.write(key + ';' + value + ';\n')


process = CrawlerProcess({
    'DOWNLOAD_DELAY': delay,
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
        'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': None,
    },
    # 'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 302],
    'HTTPERROR_ALLOW_ALL': True,
})
process.crawl(CommentSpider)
process.start()
Example #5
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from jobparser import settings
from jobparser.spiders.hhru import HhruSpider
from jobparser.spiders.superjob import SuperjobSpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)
    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(HhruSpider)
    process.crawl(SuperjobSpider)
    process.start()
Example #6
from scrapy.crawler import CrawlerProcess
from s3suspects.spiders import suspects
from scrapy.utils.project import get_project_settings


if __name__ == '__main__':
    settings = get_project_settings()
    crawler = CrawlerProcess(settings)

    crawler.crawl('suspects')
    crawler.start()
Example #7
retmax = 100
year = None

first_arg = sys.argv[1]
second_arg = sys.argv[2]

search_name = first_arg
to_id = second_arg
# Name of the author to investigate

# Global dataframe which contains all data retrieved on parent and child publications
gdf = pd.DataFrame(columns=['date', 'id', 'name', 'layer', 'parent', 'fb'])

# Create a crawler process to run the spider
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
url = ""
if year is None:
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=" + format_search(search_name) + "[author]&retmax=" + str(retmax) + "&retmode=json"
else:
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=" + format_search(search_name) + "[author]" + year + "[pubdate]&retmax=" + str(retmax) + "&retmode=json"

print(format_search(search_name))

ChamberSpider.start_urls = [ url ]
process.crawl(ChamberSpider)
x = process.start()

# Output the contents of the dataframe into a csv file
file_name = to_id  # str(search_name) + "_data_ms" + str(datetime.datetime.now().microsecond)
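# A minimal sketch of the CSV output step described in the comment above (the
# original code is cut off here); it assumes gdf and file_name as defined
# earlier, and the '.csv' suffix is an assumption.
gdf.to_csv(file_name + '.csv', index=False)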
Example #8
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from lesson5.gbparse import settings
from lesson5.gbparse.spiders.instagram import InstagramSpider

if __name__ == '__main__':
    scr_settings = Settings()
    scr_settings.setmodule(settings)
    process = CrawlerProcess(settings=scr_settings)
    # process.crawl(GeekbrainsSpider)
    process.crawl(InstagramSpider)
    process.start()

# resource: Instagram
# task:
# Log in first, then walk through an arbitrary list of users.
# Extract: the list of followers and the list of accounts the inspected user follows.
# Save to Mongo in a way that makes it convenient and fast to look up the
# follow data for any given user.
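# A minimal sketch (an assumption, not the project's pipeline) of one way to
# store the data described above with pymongo: one document per inspected
# user, so followers/following come back with a single indexed lookup.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
follow_graph = client['instagram']['follow_graph']
follow_graph.create_index('username', unique=True)
follow_graph.update_one(
    {'username': 'some_user'},                     # hypothetical username
    {'$set': {'followers': ['user_a', 'user_b'],   # lists the spider collected
              'following': ['user_c']}},
    upsert=True,
)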
Example #9
def runSpider(spiderName, userAgent):
    process = CrawlerProcess({'USER_AGENT': userAgent})

    for i in spiderName:
        process.crawl(i)
    process.start()
Example #10
    def __init__(self):
        self.process = CrawlerProcess()
Example #11
        print(getattr(Mrjatt, 'link_type')[getattr(Mrjatt, 'x')])
        if getattr(Mrjatt, 'link_type')[getattr(Mrjatt, 'x')] == 'Movies':
            for i in msc_nm:
                yield i
            x = int(
                input('Number of responses are ' + str(len(link) + 1) + ': '))
            yield response.follow('https://www.songs-mp3.net' + link[x - 1],
                                  callback=self.endgame)
        else:
            # print(True)
            for i in range(len(msc_nm)):
                # print(True,msc_nm[i])
                if msc_nm[i] + '.mp3' == getattr(Mrjatt,
                                                 'naam')[getattr(Mrjatt, 'x')]:
                    # print(True)
                    # yield c
                    # print('i am in')
                    yield response.follow('https://www.songs-mp3.net' +
                                          link[i],
                                          callback=self.endgame)

    def endgame(self, response):
        song_link = response.xpath(
            '/html/body/div[2]/div[2]/div/div[3]/div[2]/div[2]/div[2]/a/@href'
        ).extract_first()
        yield {'this_is_the_download_link': song_link}


process = CrawlerProcess()
process.crawl(Mrjatt)
process.start()
Example #12

import scrapy
from scrapy.crawler import CrawlerProcess


class MySpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')


process = CrawlerProcess(settings={
    "FEEDS": {
        "items.json": {
            "format": "json"
        },
    },
})

process.crawl(MySpider)
process.start()
Example #13
def crawl(spider, setting):
    process = CrawlerProcess({**get_project_settings(), **setting})
    process.crawl(spider)
    process.start()
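# A hypothetical usage sketch for the crawl() helper above; QuotesSpider and
# the overridden feed settings are illustrative, not part of the original
# project.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes_demo'
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        for text in response.css('span.text::text').getall():
            yield {'quote': text}


if __name__ == '__main__':
    crawl(QuotesSpider, {'FEED_FORMAT': 'csv', 'FEED_URI': 'quotes_demo.csv'})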
Example #14
    def fetch_previous_article(self, html):
        soup = BeautifulSoup(html, features='lxml')

        for a in soup.find_all('a', {'class': 'blog-pager-older-link'}):
            yield Request(a['href'], self.parse)

    def save_article(self, url, html):
        filename = url.split('/')[-1]
        filepath = os.path.join(self.dst_folder, filename)

        LOGGER.info(f'Saving article to {filepath}')
        with open(filepath, 'wb') as f:
            f.write(html)


if __name__ == '__main__':
    # File `urls.yaml` is intentionally not committed in order to hide the identity
    # of the scraped blogs
    with open('urls.yaml') as f:
        urls = yaml.safe_load(f)
        start_article_url = urls['blogger']

    crawler = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'DOWNLOAD_DELAY': 3
    })
    crawler.crawl(BloggerSpider,
                  start_article_url,
                  dst_folder='data/html/blogger/')
    crawler.start()
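# A hypothetical shape for the uncommitted urls.yaml read above; only the
# 'blogger' key is implied by the code, and the URL is a placeholder.
#
# blogger: https://example.blogspot.com/2020/01/some-article.html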
Example #15
        Rule(
            LinkExtractor(
                #allow=r'https://www.paginasamarillas.com.ar/buscar/q/funerarias/p-\d+/\?tieneCobertura=true'
                allow=r'funerarias/p-\d+'
            ), follow=True, callback="parseador"
        ),
    )
    
    def parseador(self, response):
        print('url:', response.url)
        sel = Selector(response)
        funerarias = sel.xpath('//div[contains(@class, "figBox")]')
    
        limpiar = MapCompose(
            lambda i: i.replace('\n', '').replace('\r', '').replace('\t', '').strip())

        for funeraria in funerarias:
            item = ItemLoader(Articulo(), funeraria)
            item.add_xpath('nombre', './/span[@class="semibold"]/text()', limpiar)
            item.add_xpath('direccion', './/span[@class="directionFig"]/text()', limpiar)
            item.add_xpath('telefono', './/span[@itemprop="telephone"]/text()', limpiar)
            item.add_xpath('comunaregion', './/span[@class="city"]/text()', limpiar)
    
            yield item.load_item()
    
#---------------------------------------------------------------------------

process = CrawlerProcess({
     'FEED_FORMAT': 'csv',
     'FEED_URI': 'output.csv'
})
process.crawl(SeccionAmarillaCrawler)
process.start()
Example #16
def runSingleSpider(spider, UA):
    process = CrawlerProcess({'USER_AGENT': UA})
    process.crawl(spider)
    process.start()
Example #17
from flask import Flask
from string import Template
from scrapy.crawler import CrawlerProcess
from spiders.ouc_modules import OucModulesSpider
import os
import json
app = Flask(__name__)

module_json = "modules.json"
module_yml = "data/cos/modules.yml"

if os.path.exists(module_json):
    os.remove(module_json)

crawler = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'FEED_FORMAT': 'json',
    'FEED_URI': module_json
})

crawler.crawl(OucModulesSpider)
crawler.start()

if os.path.exists(module_yml):
    os.remove(module_yml)

with open(module_yml, "w") as f:
    f.write('categories:' + '\n')
    f.write('- modules' + '\n')
    f.write('conversations:' + '\n')
    with open(module_json) as mj:
        for item in json.load(mj):
Example #18
# $ scrapy crawl myspider -a word="abba"
#

import scrapy


class MySpider(scrapy.Spider):

    name = 'myspider'

    start_urls = ['http://quotes.toscrape.com']

    def __init__(self, word=None, *args, **kwargs):  # <--- receive parameter
        super().__init__(*args, **kwargs)
        self.word = word  # <--- receive parameter

    def parse(self, response):
        print('url:', response.url)
        print('word:', self.word)  # <--- use parameter


# --- it runs without a Scrapy project ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(MySpider, word='tags')  # <--- send parameter
c.start()
Example #19
    def handle(self, *args, **options):
        process = CrawlerProcess(get_project_settings())
        process.crawl(NintendoSpider)
        process.crawl(PS4Spider)
        process.start()
Example #20
            elemDes = response.css(
                'div.product-single__description.rte').extract_first()
            des = '\n'.join(
                Selector(text=elemDes).xpath('//p/text()').extract())
            yield {
                'name':
                response.css('h1.product-single__title::text').extract_first(),
                'price':
                price,
                'image_urls':
                response.xpath(
                    '//a[contains(@href, "products")]/img/@src').extract(),
                'description':
                des
            }


########################################################################
# Output
########################################################################

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'FEED_FORMAT': 'json',
    'FEED_URI': 'result.json'
})

process.crawl(ProductosSpider)
process.start()

exit(0)
Example #21
    def handle(self, *args, **options):
        print('Spinning up the crawler......')
        process = CrawlerProcess()
        process.crawl(engines.Citizen)
        process.start()
Example #22
def spider_task():
    process = CrawlerProcess(get_project_settings())
    process.crawl(HotmovieSpider)
    process.start()
Example #23
#settings['USER_AGENT'] = None
settings['ROBOTSTXT_OBEY'] = False
settings['DOWNLOAD_DELAY'] = 1
settings['ITEM_PIPELINES'] = {
        'redfin.pipelines.SQLiteStoreItemPipeline': 300,
        }
settings['AUTOTHROTTLE_ENABLED'] = True
settings['AUTOTHROTTLE_START_DELAY'] = 5
settings['AUTOTHROTTLE_MAX_DELAY'] = 60
settings['AUTOTHROTTLE_TARGET_CONCURRENCY'] = 1.0
settings['DOWNLOADER_MIDDLEWARES'] = {
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        'redfin.rotate_useragent.RotateUserAgentMiddleware': 400,
        }

    

#if crnt_date in SunDays:
if True:
    print('Run Redfin SF Home')
    process = CrawlerProcess(settings)
    process.crawl("redfin_sf_home")
    process.start()







Example #24
from scrapy.crawler import CrawlerProcess
import scrapy
import sys
import re
import datetime

class ParktakesSpider_level(scrapy.Spider):
    name = 'parktakes_levq'
    def __init__(self, inputq=""):
        super().__init__()
        self.inputw = inputq
        # self.records_count = 0

    def start_requests(self):
        url = self.inputw
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        NAME_SELECTOR = 'table tr'
        for each_td in response.css(NAME_SELECTOR):
            if(each_td.css('tr td p::text').extract_first()):
                if("Records" in str(each_td.css('tr td p::text').extract_first().encode('ascii','ignore'))):
                    records_count.append(each_td.css('tr td p b::text').extract()[2].encode('ascii','ignore'))
                    break

search_url = sys.argv[1]
records_count = []
SETTINGS = {'LOG_ENABLED': False}
process = CrawlerProcess(SETTINGS)
process.crawl(ParktakesSpider_level, inputq=search_url)
process.start()
print(int(records_count[0]))
Example #25
                            "parentUrl": url
                        }))
                except Exception as e:
                    print(e)
            ImageDBTransaction.commit()
            URLDBTransaction = FUTURE.beginTransaction(writePermission=True)
            FUTURE.addElementToIndex(
                encodeURLAsNumber(url, 1),
                bson.dumps({
                    "vec": webPageSummaryVector.tostring(),
                    "language": webPageVector[2],
                    "body": webPageVector[1],
                    "header": webPageVector[3],
                    "url": url
                }), URLDBTransaction)
            URLDBTransaction.commit()
        for href in response.css("a::attr(href)"):
            yield response.follow(href, self.parse)


if __name__ == "__main__":
    FUTURE = Monad("future_urls")
    images = lmdb.open("future_images", map_size=int(1e12), writemap=True)

    process = CrawlerProcess({
        "USER_AGENT":
        "FUTURE by Roberto Treviño Cervantes. I'am building a safer, faster and more precise Search Engine, if you do not want to be part of the index, report me to [email protected]"
    })
    process.crawl(Indexer)
    process.start()
Example #26
    def __init__(self):
        self.crawler = CrawlerProcess(get_project_settings())
Example #27
            international_faculty_number = float(
                international_faculty_number.replace(',', ''))
        if student_faculty_ratio is not None:
            student_faculty_ratio = float(
                student_faculty_ratio.replace(',', ''))

        yield {
            # 'Status':response.css("div.uni_info").css("li[title*='Status']").css('span.info-setails::text').get(),
            # 'Research Output':response.css("div.uni_info").css("li[title*='Research Output']").css('span.info-setails::text').get(),
            # 'Scholarships':response.css("div.uni_info").css("li[title*='Scholarships']").css('span.info-setails::text').get(),
            # 'Size':response.css("div.uni_info").css("li[title*='Size']").css('span.info-setails::text').get(),
            'uni': name,
            'total_students_number': total_students_number,
            'international_students_number': international_students_number,
            'total_faculty_number': total_faculty_number,
            'international_faculty_number': international_faculty_number,
            'student_faculty_ratio': student_faculty_ratio
        }


process = CrawlerProcess(settings={
    "FEEDS": {
        "raw_values.json": {
            "format": "json",
            "overwrite": True
        },
    },
})

process.crawl(qsRanking)
process.start()
Example #28
        tags = response.xpath(
            '//a[contains(@class, "post-tags")]/text()').extract()
        for para in response.xpath(
                '//div[contains(@class, "article-single-content")]/p'):
            i += 1
            yield {
                'Title': name[0],
                'Paragraph': str(i),
                'Text': para.xpath('text()').extract()[0],
                'Tags': tags,
                'URL': response.url
            }


# Tell the script how to run the crawler by passing in settings.
# The new settings have to do with scraping etiquette.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',  # Store data in JSON format.
    'FEED_URI': 'phdessay.json',  # Name our storage file.
    'LOG_ENABLED': False,  # Turn off logging for now.
    'ROBOTSTXT_OBEY': True,
    'USER_AGENT': 'tjeffkessler ([email protected])',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'DEBUG': True
})

# Start the crawler with our spider.
process.crawl(EssaySpider)
process.start()
print('Finished scraping at {}'.format(datetime.now()))
Example #29
        # review date
        review_date = response.css('span.ratingDate::text').extract_first()
        # direct to review text
        review_text = response.css('p.partial_entry::text')
        # extract and clean review text
        review_text = review_text.extract_first()
        # append the extracted fields to the module-level reviews list
        reviews_list.append([response.url, review_title, review_date, review_text])


# Initialize the list
reviews_list=[]
import csv

#
# Run the Spider
process = CrawlerProcess()
process.crawl(TASpider)
process.start()

# reviews_list=list(set(reviews_list))
print(len(reviews_list))
print(reviews_list[-4:])

with open('reviewsData.csv', 'w') as f:
    #configure writer to write standard csv file
    writer = csv.writer(f, delimiter=',')
    writer.writerow(['Site', 'Review_title', 'Review_date', 'Review_paragraph'])
    for item in reviews_list:
        #Write item to f
        writer.writerow([item[0], item[1], item[2], item[3]]) 
Example #30
def main():
    target_board = ['Gossiping', 'Stock', 'NBA']
    process = CrawlerProcess(get_project_settings())
    for board in target_board:
        process.crawl('PTTCrawler', board=board)
    process.start()