Code Example #1
File: marriott.py Project: iandees/all-the-places
    def parse(self, response):
        xml = Selector(response)
        xml.remove_namespaces()

        urls = xml.xpath('//loc/text()').extract()
        for url in urls:
            yield scrapy.Request(response.urljoin(url), callback=self.parse_hotel)
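
These sitemap spiders strip namespaces before querying //loc because sitemap XML declares a default namespace, and un-prefixed XPath expressions match nothing against namespaced nodes. A minimal sketch of the effect, using an illustrative sitemap snippet rather than the spider's real input:

from scrapy.selector import Selector

SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/hotels/1</loc></url>
</urlset>"""

sel = Selector(text=SITEMAP, type='xml')
print(sel.xpath('//loc/text()').extract())   # [] -- the namespaced <loc> is not matched

sel.remove_namespaces()
print(sel.xpath('//loc/text()').extract())   # ['https://example.com/hotels/1']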
Code Example #2
    def parse(self, response):
        xml = Selector(response)
        xml.remove_namespaces()

        urls = xml.xpath('//loc/text()').extract()

        for url in urls:
            yield scrapy.Request(url, callback=self.parse_store)
Code Example #3
File: xml.py Project: 01-/portia
 def _extract_links(self, response):
     type = 'html'
     if response.body_as_unicode().strip().startswith('<?xml version='):
         type = 'xml'
     xxs = Selector(response, type=type)
     if self.remove_namespaces:
         xxs.remove_namespaces()
     for url in xxs.xpath(self.xpath).extract():
         yield Link(url.encode(response.encoding))
Code Example #4
 def parse(self, response):
     sel = Selector(response)
     sel.remove_namespaces()
     mbid = sel.xpath("//release-group[1]/@id").extract()
     if mbid:
         yield Request("http://coverartarchive.org/release-group/" +
                       mbid[0],
                       callback=self.parse_album,
                       meta=response.meta)
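
The parse_album callback is not shown in this snippet. Assuming the Cover Art Archive release-group endpoint still returns JSON with an "images" list (an assumption worth checking against the current API documentation), a hypothetical callback could look like this:

import json

def parse_album(self, response):
    # Hypothetical callback for the request above; the "images"/"image"/"front"
    # field names are assumptions about the Cover Art Archive JSON layout.
    data = json.loads(response.body)
    for image in data.get('images', []):
        yield {
            'release_group': response.url.rsplit('/', 1)[-1],
            'image_url': image.get('image'),
            'is_front': image.get('front', False),
        }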
Code Example #5
File: goldsgym.py Project: zanachka/alltheplaces
    def parse(self, response):
        xml = Selector(response)
        xml.remove_namespaces()

        urls = xml.xpath('//loc/text()').extract()
        for url in urls:
            path = "/".join(urlparse(url).path.split('/')[:-1])
            yield scrapy.Request(response.urljoin(path),
                                 callback=self.parse_hotel)
Code Example #6
 def _extract_links(self, response):
     type = 'html'
     if response.body_as_unicode().strip().startswith('<?xml version='):
         type = 'xml'
     xxs = Selector(response, type=type)
     if self.remove_namespaces:
         xxs.remove_namespaces()
     for url in xxs.xpath(self.xpath).extract():
         yield Link(url.encode(response.encoding))
Code Example #7
File: kroger.py Project: vinmay/alltheplaces
    def parse(self, response):
        xml = Selector(response)
        xml.remove_namespaces()

        urls = xml.xpath('//loc/text()').extract()
        for url in urls:
            yield scrapy.Request(url,
                                 callback=self.parse_store,
                                 headers=DEFAULT_HEADERS,
                                 meta=response.meta)
Code Example #8
 def parse_item(self, response):
     #self.log('Crawling the link: %s' % response.url)
     sel = Selector(response)
     sel.remove_namespaces()
     topic = sel.xpath("//div[@id='SHOW_TOPIC']")
     item = TripAdvisorForumPostItem()
     item['text'] = topic.xpath("//div[@class='postBody']").extract() #topic.xpath("//p/text()").extract()
     #item['date'] = topic.xpath("//div[@class='postDate']/text()").extract()
     item['link'] = response.url
     return item
Code Example #9
    def parse(self, response):
        xml = Selector(response)
        xml.remove_namespaces()

        urls = xml.xpath("//loc/text()").extract()
        urls = [url.strip() for url in urls]
        for url in urls:
            path = scrapy.utils.url.parse_url(url).path
            if re.match(r"^/.*/.*/.*$", path):
                yield scrapy.Request(url, callback=self.parse_location)
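
The regex here only keeps URLs whose path has at least three slash-separated segments. A quick sketch of the filter with hypothetical URLs (not taken from the sitemap this spider actually crawls):

import re
from scrapy.utils.url import parse_url

for url in ['https://example.com/tx/austin/main-st-store',
            'https://example.com/stores']:
    path = parse_url(url).path
    # the pattern requires at least three '/' characters in the path
    print(path, bool(re.match(r"^/.*/.*/.*$", path)))
# /tx/austin/main-st-store True
# /stores False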
Code Example #10
    def parse(self, response):
        xml = Selector(response)
        xml.remove_namespaces()

        urls = xml.xpath('//loc/text()').extract()

        for url in urls:
            if re.match(r'^https://99only.com/stores/.+?$', url):
                if 'near-me' in url:
                    continue
                yield scrapy.Request(url, callback=self.parse_location)
Code Example #11
    def parse(self, response):
        xml = Selector(response)
        xml.remove_namespaces()

        urls = xml.xpath('//loc/text()').extract()
        urls = [url.strip() for url in urls]

        for url in urls:
            if '/home/' in url:
                continue
            yield scrapy.Request(url, callback=self.parse_state_sitemap)
Code Example #12
File: xml.py Project: daqv/portia-dashboard
 def _extract_links(self, response):
     body = response.body_as_unicode()
     _type = 'html'
     if body.lstrip().startswith('<?xml version='):
         _type = 'xml'
     xxs = Selector(text=body, type=_type)
     if self.remove_namespaces:
         xxs.remove_namespaces()
     for url in xxs.xpath(self.xpath).extract():
         if not isinstance(url, six.text_type):
             url = url.encode(response.encoding)
         yield Link(url)
Code Example #13
    def parse_state_sitemap(self, response):
        xml = Selector(response)
        xml.remove_namespaces()

        urls = xml.xpath('//loc/text()').extract()
        urls = [url.strip() for url in urls]

        for url in urls:
            # store URLs follow this pattern:
            # the URL must have 3 path segments (state/city/address) and the last one cannot be just a postal code
            if re.match(r'^https://pizza.dominos.com/.*?/.*?/(?!(\d{5}/)).*?/$', url):
                yield scrapy.Request(url, callback=self.parse_place)
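
The negative lookahead in that pattern is what rejects third segments that are only a ZIP code. A small illustration with made-up URLs:

import re

STORE_RE = re.compile(r'^https://pizza.dominos.com/.*?/.*?/(?!(\d{5}/)).*?/$')

# made-up URLs; only the second is rejected, because its third segment is a bare ZIP code
print(bool(STORE_RE.match('https://pizza.dominos.com/texas/austin/123-main-st/')))  # True
print(bool(STORE_RE.match('https://pizza.dominos.com/texas/austin/78701/')))        # False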
Code Example #14
File: metrodiner.py Project: zanachka/alltheplaces
    def parse(self, response):
        xml = Selector(response)
        xml.remove_namespaces()

        urls = xml.xpath('//loc/text()').extract()
        urls = [url.strip() for url in urls]

        for url in urls:
            if re.search(r'orlando-university', url):
                pass
            elif re.search(r'locations/', url):
                yield scrapy.Request(url, callback=self.parse_location)
Code Example #15
    def parse_state_sitemap(self, response):
        xml = Selector(response)
        xml.remove_namespaces()

        urls = xml.xpath('//loc/text()').extract()
        urls = [url.strip() for url in urls]

        # individual store pages are listed at the top, then a state page, then a bunch of other non-store pages
        # find the index position of the state page and then only parse urls before that
        i = urls.index(
            re.search(r'^(https://locations.rentacenter.com/.+?)/.*$',
                      urls[0]).groups()[0] + '/')
        for url in urls[:i]:
            yield scrapy.Request(url, callback=self.parse_location)
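
The index trick above relies on the ordering described in the comment. A small sketch with hypothetical URLs showing how the state-page URL is derived from the first entry and used to cut the list:

import re

# hypothetical sitemap ordering: store pages first, then the bare state page, then other pages
urls = [
    'https://locations.rentacenter.com/texas/austin/101-main-st',
    'https://locations.rentacenter.com/texas/dallas/22-elm-ave',
    'https://locations.rentacenter.com/texas/',
    'https://locations.rentacenter.com/search',
]

state_page = re.search(r'^(https://locations.rentacenter.com/.+?)/.*$', urls[0]).groups()[0] + '/'
i = urls.index(state_page)
print(urls[:i])   # only the two store pages listed before the state page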
Code Example #16
    def parse(self, response):
        content_selector = Selector(text=response.body)
        content_selector.register_namespace('arxiv', 'http://arxiv.org/schemas/atom')
        content_selector.register_namespace('xmlns', 'http://www.w3.org/2005/Atom')
        content_selector.remove_namespaces()

        for line in content_selector.xpath('//feed/entry'):
            item = TestItem()
            item['id'] = line.xpath('id/text()').extract()
            item['title'] = line.xpath('title/text()').extract()
            item['links'] = line.xpath('link/@href').extract()
            item['authors'] = line.xpath('author/name/text()').extract()
            item['comments'] = line.xpath('comment/text()').extract()
            item['primary_category'] = line.xpath('primary_category/@term').extract()
            item['categories'] = line.xpath('category/@term').extract()
            item['summary'] = line.xpath('summary/text()').extract()
            yield item
Code Example #17
    def parse_job_index(self, response):
        for job in super(ATSSpider2_RecursiveList, self).parse_job_index(response):
            yield job

        if self.ruleindex is None:
            raise CloseSpider('No jobs found.')

        sel = Selector(response)
        sel.remove_namespaces()

        loader = ItemLoader(selector=sel)

        rule = arg_to_iter(self.parse_job_index_rules)[self.ruleindex].get('recursive')

        if rule is None:
            raise CloseSpider('parse_job_index_rules has no recursive rules')

        if self.extract_from_rule(loader, rule.get('condition')):
            if self.extract_from_rule(loader, rule.get('from_response', {'static': False})):
                url = self.extract_from_rule(loader, rule.get('url'))
                if url:
                    yield FormRequest.from_response(
                        response,
                        url=url,
                        formdata=self.extract_from_rule(loader, rule.get('formdata')),
                        callback=self.parse_job_index,
                        dont_filter=self.extract_from_rule(loader, rule.get('dont_filter', {'static': False})),
                        dont_click=self.extract_from_rule(loader, rule.get('dont_click', {'static': False}))
                    )
                else:
                    yield FormRequest.from_response(
                        response,
                        formdata=self.extract_from_rule(loader, rule.get('formdata')),
                        callback=self.parse_job_index,
                        dont_filter=self.extract_from_rule(loader, rule.get('dont_filter', {'static': False})),
                        dont_click=self.extract_from_rule(loader, rule.get('dont_click', {'static': False}))
                    )
            else:
                yield FormRequest(
                    url=self.extract_from_rule(loader, rule.get('url')),
                    formdata=self.extract_from_rule(loader, rule.get('formdata')),
                    callback=self.parse_job_index,
                    dont_filter=self.extract_from_rule(loader, rule.get('dont_filter', {'static': False}))
                )
Code Example #18
    def parse_item(self, response):
        sel = Selector(response)
        sel.remove_namespaces()
        reviews = sel.xpath("//div[@class='reviewSelector ']")
        items = []
        for review in reviews:
            item = TripAdvisorAttractionReviewItem()
            item['text'] = review.xpath("div[@class='review basic_review inlineReviewUpdate provider0 newFlag']/div[@class='col2of2']/div[@class='innerBubble']/div[@class='wrap']/div[@class='entry']/p/text()").extract()
            if not item['text']:
                continue
            score = review.xpath("div[@class='review basic_review inlineReviewUpdate provider0 newFlag']/div[@class='col2of2']/div[@class='innerBubble']/div[@class='wrap']/div[@class='rating reviewItemInline']/span/img/@alt").extract()

            if score:
                regex = re.compile(r"(\d)")
                r = regex.findall(str(score))    # e.g. "4 of 5 stars"
                item['score'] = float(r[0])      # then 4
                item['max_score'] = float(r[1])  # then 5

            items.append(item)
        return items
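
The single-digit pattern works because TripAdvisor ratings are "N of 5"; findall pulls both numbers out of the stringified list. A quick illustration with a hypothetical alt text:

import re

regex = re.compile(r"(\d)")
score = ['4 of 5 stars']          # hypothetical value, shaped like extract() output
r = regex.findall(str(score))     # ['4', '5'] -- every single digit in the string
print(float(r[0]), float(r[1]))   # 4.0 5.0

Note that the pattern captures one digit at a time, which is fine for 1-5 star ratings but would split a two-digit number.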
Code Example #19
    def parse_journal_issue(self, dataset, target_folder, filename):
        """Parse journal issue tags and files if there is any in the dataset.xml.
        The journal issue xmls are containing all the dois for artciles in that issue. Extract this data,
        and later update it from the journal item xml.
        """

        data = []

        for issue in dataset.xpath('//journal-issue'):
            tmp = {
                'volume': "%s %s" % (issue.xpath('//volume-issue-number/vol-first/text()')[0].extract(),
                                     issue.xpath('//volume-issue-number/suppl/text()')[0].extract()),
            }
            issue_file = os.path.join(target_folder, filename,
                                      issue.xpath('./files-info/ml/pathname/text()')[0].extract())

            self.log('Parsing journal issue xml: %s' % issue_file, logging.INFO)

            articles = {}
            with open(issue_file, 'r') as issue_file:
                iss = Selector(text=issue_file.read())
                iss.remove_namespaces()
                for article in iss.xpath('//include-item'):
                    doi = article.xpath('./doi/text()')[0].extract()

                    first_page = None
                    if article.xpath('./pages/first-page/text()'):
                        first_page = article.xpath('./pages/first-page/text()')[0].extract()

                    last_page = None
                    if article.xpath('./pages/last-page/text()'):
                        last_page = article.xpath('./pages/last-page/text()')[0].extract()

                    articles[doi] = {'first-page': first_page,
                                     'last-page': last_page}

            tmp['articles'] = articles
            data.append(tmp)

        return data
Code Example #20
    def handle_package(self, response):
        """Handle the zip package and yield a request for every XML found."""
        import traceback
        with open(response.meta["local_filename"], 'w') as destination_file:
            destination_file.write(response.body)
        filename = os.path.basename(
            response.url).rstrip("A.tar").rstrip('.zip')
        # TMP dir to extract zip packages:
        target_folder = mkdtemp(prefix=filename + "_",
                                dir=ELSEVIER_UNPACK_FOLDER)

        zip_filepath = response.meta["local_filename"]
        files = uncompress(zip_filepath, target_folder)

        # The xml files shouldn't be removed after processing; they will
        # be later uploaded to Inspire. So don't remove any tmp files here.
        try:
            for f in files:
                if 'dataset.xml' in f:
                    from scrapy.selector import Selector
                    with open(f, 'r') as dataset_file:
                        dataset = Selector(text=dataset_file.read())
                        data = []
                        for i, issue in enumerate(
                                dataset.xpath('//journal-issue')):
                            tmp = {}
                            tmp['volume'] = "%s %s" % (issue.xpath(
                                '//volume-issue-number/vol-first/text()'
                            )[0].extract(
                            ), issue.xpath('//volume-issue-number/suppl/text()'
                                           )[0].extract())
                            tmp['issue'] = issue.xpath(
                                '//issn/text()')[0].extract()
                            issue_file = os.path.join(
                                target_folder, filename,
                                issue.xpath('./files-info/ml/pathname/text()')
                                [0].extract())
                            arts = {}
                            with open(issue_file, 'r') as issue_file:
                                iss = Selector(text=issue_file.read())
                                iss.remove_namespaces()
                                for article in iss.xpath('//include-item'):
                                    doi = article.xpath(
                                        './doi/text()')[0].extract()
                                    first_page = None
                                    if article.xpath(
                                            './pages/first-page/text()'):
                                        first_page = article.xpath(
                                            './pages/first-page/text()'
                                        )[0].extract()
                                    last_page = None
                                    if article.xpath(
                                            './pages/last-page/text()'):
                                        last_page = article.xpath(
                                            './pages/last-page/text()'
                                        )[0].extract()
                                    arts[doi] = {
                                        'files': {
                                            'xml': None,
                                            'pdf': None
                                        },
                                        'first-page': first_page,
                                        'last-page': last_page
                                    }
                            tmp['articles'] = arts
                            data.append(tmp)
                        tmp_empty_data = 0
                        if not data:
                            tmp_empty_data = 1
                            data.append({
                                'volume': None,
                                'issue': None,
                                'articles': {}
                            })
                        for article in dataset.xpath('//journal-item'):
                            doi = article.xpath(
                                './journal-item-unique-ids/doi/text()'
                            )[0].extract()
                            if article.xpath(
                                    './journal-item-properties/online-publication-date/text()'
                            ):
                                publication_date = article.xpath(
                                    './journal-item-properties/online-publication-date/text()'
                                )[0].extract()[:18]
                            else:
                                publication_date = datetime.datetime.now(
                                ).strftime("%Y-%m-%dT%H:%M:%S")
                            journal = article.xpath(
                                './journal-item-unique-ids/jid-aid/jid/text()'
                            )[0].extract()
                            if journal == "PLB":
                                journal = "Physics Letters B"
                            if journal == "NUPHB":
                                journal = "Nuclear Physics B"

                            if tmp_empty_data:
                                data[0]['articles'][doi] = {
                                    'files': {
                                        'xml': None,
                                        'pdf': None
                                    },
                                    'first-page': None,
                                    'last-page': None,
                                }
                            for i, issue in enumerate(data):
                                if doi in data[i]['articles']:
                                    data[i]['articles'][doi][
                                        'journal'] = journal
                                    data[i]['articles'][doi][
                                        'publication-date'] = publication_date
                                    xml = os.path.join(
                                        target_folder, filename,
                                        article.xpath(
                                            './files-info/ml/pathname/text()')
                                        [0].extract())
                                    pdf = os.path.join(
                                        target_folder, filename,
                                        article.xpath(
                                            './files-info/web-pdf/pathname/text()'
                                        )[0].extract())
                                    data[i]['articles'][doi]['files'][
                                        'xml'] = xml
                                    data[i]['articles'][doi]['files'][
                                        'pdf'] = pdf
                                    if 'vtex' in zip_filepath:
                                        pdfa = os.path.join(
                                            os.path.split(pdf)[0],
                                            'main_a-2b.pdf')
                                        pdfa = os.path.join(
                                            target_folder, pdfa)
                                        data[i]['articles'][doi]['files'][
                                            'pdfa'] = pdfa
                        for i, issue in enumerate(data):
                            print('a')
                            for doi in data[i]['articles']:
                                print('b')
                                try:
                                    print('try')
                                    xml_file = open(
                                        data[i]['articles'][doi]['files']
                                        ['xml'], 'r')
                                    print(xml_file)
                                    xml_file_content = xml_file.read()
                                    for nodename in self.itertag:
                                        print(nodename)
                                        for selector in xmliter(
                                                xml_file_content, nodename):
                                            print(selector)
                                            yield self.parse_node(
                                                data[i], doi, zip_filepath,
                                                selector)
                                except:
                                    print(traceback.print_exc())
        except:
            import traceback
            traceback.print_exc()
Code Example #21
 def parse(self, response):
     sel = Selector(response)
     sel.remove_namespaces()
     mbid = sel.xpath("//release-group[1]/@id").extract()
     if mbid:
         yield Request("http://coverartarchive.org/release-group/" + mbid[0],
                       callback=self.parse_album, meta=response.meta)
Code Example #22
    def parse(self, response):
        x = Selector(response)
        x.remove_namespaces()
        items = []
        items = x.xpath('//record/metadata/RDF')

        jsons = []

        for item in items:
            media_type = ""
            layer_type = ""

            # get ending of "alternative" title, which gives file extension
            file_ext = ""
            if not item.xpath('Resource/alternative/Description/value/text()').extract() == []:
                file_ext = item.xpath('Resource/alternative/Description/value/text()').extract()[0].split('.')[-1]

                # we don't want these
                if file_ext == "cvs" or file_ext == "CVS" or file_ext == "xls" or file_ext == "XLS" or file_ext == "txt" or file_ext == "TXT":
                    continue
                if file_ext == "pdf" or file_ext == "PDF" or file_ext == "doc" or file_ext == "DOC" or file_ext == "docx" or file_ext == "DOCX" or file_ext == "rtf" or file_ext == "RTF":
                    media_type = "Document"
                    layer_type = file_ext.upper()
                if file_ext == "jpg" or file_ext == "JPG" or file_ext == "jpeg" or file_ext == "JPEG" or file_ext == "png" or file_ext == "PNG":
                    media_type = "Image"
                    layer_type = "Image"
                if file_ext == "mp3" or file_ext == "MP3" or file_ext == "wav" or file_ext == "WAV" or file_ext == "m4a" or file_ext == "M4A" or file_ext == "wma" or file_ext == "WMA":
                    media_type = "Audio"
                    layer_type = "Audio"
                if file_ext == "mp4" or file_ext == "MP4" or file_ext == "mpeg" or file_ext == "MPEG" or file_ext == "wmv" or file_ext == "WMV" or file_ext == "AVI" or file_ext == "avi":
                    media_type = "Video"
                    layer_type = "Video"

            # NDL link
            format = item.xpath('Resource/materialType/@resource').extract()[0]
            if media_type == "" and layer_type == "":
                if format == "http://purl.org/dc/dcmitype/StillImage" or format == "http://ndl.go.jp/ndltype/Photograph":
                    media_type = "Image"
                    layer_type = "Image"
                if format == "http://purl.org/dc/dcmitype/MovingImage":
                    media_type = "Video"
                    layer_type = "Video"
                if format == "http://purl.org/dc/dcmitype/Sound":
                    media_type = "Audio"
                    layer_type = "Audio"

            # still nothing
            if media_type == "" and layer_type == "":
                if not item.xpath('Resource/accessURL/@resource').extract() == []:
                    file_ext = item.xpath('Resource/accessURL/@resource').extract()[0].split('.')[-1]
                    # we don't want these
                    if file_ext == "cvs" or file_ext == "CVS" or file_ext == "xls" or file_ext == "XLS" or file_ext == "txt" or file_ext == "TXT":
                        continue
                    if file_ext == "pdf" or file_ext == "PDF" or file_ext == "doc" or file_ext == "DOC" or file_ext == "docx" or file_ext == "DOCX" or file_ext == "rtf" or file_ext == "RTF":
                        media_type = "Document"
                        layer_type = file_ext.upper()
                    if file_ext == "jpg" or file_ext == "JPG" or file_ext == "jpeg" or file_ext == "JPEG" or file_ext == "png" or file_ext == "PNG":
                        media_type = "Image"
                        layer_type = "Image"
                    if file_ext == "mp3" or file_ext == "MP3" or file_ext == "wav" or file_ext == "WAV" or file_ext == "m4a" or file_ext == "M4A" or file_ext == "wma" or file_ext == "WMA":
                        media_type = "Audio"
                        layer_type = "Audio"
                    if file_ext == "mp4" or file_ext == "MP4" or file_ext == "mpeg" or file_ext == "MPEG" or file_ext == "wmv" or file_ext == "WMV" or file_ext == "AVI" or file_ext == "avi":
                        media_type = "Video"
                        layer_type = "Video"

            creator = item.xpath('Resource/publisher/Agent/name/text()').extract()
            title = item.xpath('Resource/title/Description/value/text()').extract()
            uri = item.xpath('Resource/@about').extract()
            attribution_uri = item.xpath('MetaResource/@about').extract()
            date_created = item.xpath('MetaResource/created/text()').extract()
            access_url = item.xpath('Resource/accessURL/@resource').extract()

            if media_type == "Image" or media_type == "Audio" or media_type == "Video" or media_type == "Document":
                if not access_url == []:
                    uri = access_url

            if not date_created:
                media_date_created = "0000-00-00 00:00:00"
            else:
                date = date_created[0].split('T')[0]
                time = date_created[0].split('T')[1].split('+')[0]
                media_date_created = date + ' ' + time

            tags = item.xpath('Resource/subject/text()').extract()
            tags_string = ""
            new_tags = []
            for tag in tags:
                if tag == '\n':
                    continue
                else:
                    if len(tag.split(u'\u3001')) == 1:
                        new_tags += tag.split(',')
                    else:
                        new_tags += tag.split(u'\u3001')

            tags_string = '"' + '", "'.join(new_tags) + '"'

            thumbnail = item.xpath('Resource/thumbnail/@resource').extract()
            locality = item.xpath('Resource/spatial/Description/label/text()').extract()

            if not locality:
                newloc = ''
            else:
                newloc = locality[0]

            if not thumbnail:
                newthumb = ''
            else:
                if thumbnail[0] == "http://fukushima.archive-disasters.jp/images/file_normal_icon.jpg":
                    newthumb = ''
                else:
                    newthumb = thumbnail[0]

            if not title:
                title = ''
            else:
                title = title[0]

            if not uri:
                uri = ''
            else:
                uri = uri[0]

            if not attribution_uri:
                attribution_uri = ''
            else:
                attribution_uri = attribution_uri[0]

            if not creator:
                creator = ""
            else:
                creator = creator[0]

            if newthumb == '':
                thumb = ''
            else:
                thumb = '", "thumbnail_url": "' + newthumb

            json_entry = '{"title": "' + title + '", "description": "", "uri": "' + uri + '", "attribution_uri": "' + attribution_uri + '", "media_creator_username": "******", "location": "' + newloc + '", "media_date_created": "' + media_date_created + '", "tags": [' + tags_string + '], "archive":"Fukushima Disaster Archives", "media_type": "' + media_type + '", "layer_type": "' + layer_type + '", "child_items_count":0, "published":1}, '

            jsons.append(json_entry)

        resumptionToken = x.xpath('//resumptionToken/text()').extract()
        if resumptionToken == []:
            print "FINISHED"
            nextFileLink = ''
            open('last.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
        else:
            nextFileLink = "http://fukushima.archive-disasters.jp/infolib/oai_repository/repository?verb=ListRecords&resumptionToken=" + resumptionToken[0].encode('ascii')
            filename = resumptionToken[0].replace('!', '').replace(':', '')
            open(filename.encode('ascii') + '.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
        yield Request(nextFileLink, callback=self.parse)
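
Building json_entry by string concatenation breaks as soon as a title or tag contains a double quote. A sketch of the same record built with json.dumps instead (field names copied from the string above; the helper name is hypothetical):

import json

def build_entry(title, uri, attribution_uri, newloc, media_date_created,
                new_tags, media_type, layer_type, thumbnail_url=''):
    # same fields as the hand-built string, but quoting is handled by json.dumps
    entry = {
        "title": title, "description": "", "uri": uri,
        "attribution_uri": attribution_uri,
        "media_creator_username": "******",
        "location": newloc, "media_date_created": media_date_created,
        "tags": new_tags, "archive": "Fukushima Disaster Archives",
        "media_type": media_type, "layer_type": layer_type,
        "child_items_count": 0, "published": 1,
    }
    if thumbnail_url:
        entry["thumbnail_url"] = thumbnail_url
    return json.dumps(entry, ensure_ascii=False)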
Code Example #23
''', re.X)

conn = db_settings.con()
c = conn.cursor()
county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
county = common.county_abbr2string(county_abbr)
election_year = common.election_year(county)
county_abbr3 = common.county2abbr3(county)
for file in glob.glob(u'../../../data/%s/meeting_minutes/%s/html/*.html' %
                      (county_abbr, election_year)):
    with codecs.open(file, 'r', encoding='utf-8') as f:
        print f.name
        fileName, fileExt = os.path.splitext(os.path.basename(f.name))
        xml_text = unicodedata.normalize('NFC', f.read())
        x = Selector(text=xml_text, type='html')
        x.remove_namespaces()
        year = int(x.xpath('//text()').re(u'表\s*[((](\d+)/\d+/\d+')[0]) + 1911
        d = {}
        for table in x.xpath(u'//table'):
            days = table.xpath('descendant::tr[1]/td//text()').re('(\d+/\d+)')
            dates = [
                '%d-%02d-%02d' %
                (year, int(day.split('/')[0]), int(day.split('/')[1]))
                for day in days
            ]
            for tr in table.xpath(
                    'descendant::tr[td[1][re:test(., "^\d+$")]]'):
                name = re.sub(u'[.﹒]', u'‧',
                              tr.xpath('td[2]//text()').extract_first() or '')
                if not name:
                    continue
Code Example #24
File: simplelist.py Project: deepak64/scraper-test
    def parse_job_index(self, response):
        '''
        Parses the job list page using the rules from `self.parse_job_index_rules`

        base - xpath that selects the job item group
        joburl - rule that returns the url from the job item group
        relative - set of rules that return the specified fields from the job item group.
        nonrelative - set of rules that return the specified fields from parts of the page outside the job item group, in the same order as the job items.
        formdata - rule that returns the formdata for the job request.
        dont_filter/dont_click/from_response - rules that determine whether to enable these options for the job request.

        ex.

        parse_job_index_rules = [
            {
                'base': xpath,
                'joburl': {
                    'xpath': [
                        {'xpaths': ['xpath1', 'xpath2', 'xpath3'], 'processors': [proc1, proc2], 're': regex},
                    ],
                    'function': foo,
                    'value': {
                        'values': 'foo', 'processors': [proc1, proc2, proc3], 're': regex
                    }
                },
                'relative': {
                    'title': {
                        'xpath': [
                            {'xpaths': ['xpath1', 'xpath2', 'xpath3'], 'processors': [proc1, proc2], 're': regex},
                        ],
                        'function': foo,
                        'value': {
                            'values': 'foo', 'processors': [proc1, proc2, proc3], 're': regex
                        }
                    },
                    ...
                    ...
                },
                'nonrelative': {
                    'title': {
                        'xpath': [
                            {'xpaths': ['xpath1', 'xpath2', 'xpath3'], 'processors': [proc1, proc2], 're': regex},
                        ],
                        'function': foo,
                        'value': {
                            'values': 'foo', 'processors': [proc1, proc2, proc3], 're': regex
                        }
                    },
                    ...
                    ...
                },
                'formdata': {
                    'xpath': [
                        {'xpaths': ['xpath1', 'xpath2', 'xpath3'], 'processors': [proc1, proc2], 're': regex},
                    ],
                    'function': foo,
                    'value': {
                        'values': 'foo', 'processors': [proc1, proc2, proc3], 're': regex
                    }
                },
                'dont_filter': {'static': False},
                'dont_click': {'static': False},
                'from_response': {'static': False}
            }
        ]
        '''
        sel = Selector(response)
        sel.remove_namespaces()

        loader = ItemLoader(selector=sel)

        for i, r in enumerate(arg_to_iter(self.parse_job_index_rules)):
            jobs = sel.xpath(r.get('base'))
            if jobs:
                self.ruleindex = i
                rule = r
                break
        else:
            rule = {}

        customitems = []

        for count in rule.get('jobcount', {}):
            self.extract_from_rule(loader, rule.get('jobcount'), response)

        for field in rule.get('nonrelative', {}):
            value = self.extract_from_rule(loader, rule.get('nonrelative').get(field), response)
            customitems.append([(field, val) for val in value])

        for ji, u in enumerate(izip_longest(jobs, *customitems, fillvalue=None)):
            jobloader = ItemLoader(selector=u[0])
            formdata = {}
            meta = {'custom_items': response.meta.get('custom_items', {})}

            for i in xrange(1, len(u)):
                meta['custom_items'][u[i][0]] = u[i][1]

            for field in rule.get('relative', {}):
                meta['custom_items'][field] = self.extract_from_rule(jobloader, rule.get('relative').get(field), response)

            if self.extract_from_rule(loader, rule.get('from_response', {'static': False}), response, jobindex=ji):
                yield FormRequest.from_response(
                    response,
                    formdata=self.extract_from_rule(jobloader, rule.get('formdata'), response, jobindex=ji),
                    callback=self.parse_job_callback(),
                    meta=meta,
                    dont_filter=self.extract_from_rule(loader, rule.get('dont_filter', {'static': False}), response, jobindex=ji),
                    dont_click=self.extract_from_rule(loader, rule.get('dont_click', {'static': False}), response, jobindex=ji)
                )
            else:
                yield FormRequest(
                    url=self.extract_from_rule(jobloader, rule.get('joburl'), response, jobindex=ji),
                    formdata=self.extract_from_rule(jobloader, rule.get('formdata'), response, jobindex=ji),
                    callback=self.parse_job_callback(),
                    meta=meta,
                    dont_filter=self.extract_from_rule(loader, rule.get('dont_filter', {'static': False}), response, jobindex=ji)
                )
Code Example #25
    def parse_json_comments(self, response):

        print("==============\nstart parsing json\n===============")

        num = re.compile(
            r'[0-9]+\.?[0-9]*')  # regular expression for picking out a number

        data = json.loads(response.body)
        ut.write_html(self.dest + "comments.html", data['html'])

        html = data['html'].replace('<br>',
                                    '\n')  # replace <br> so the comment text stays intact

        selector = Selector(text=html)
        selector.remove_namespaces()

        output = ""

        # use the regular expression to keep only complete reviews
        review_boxes = selector.xpath(
            "//div[re:test(@class, '\Areview_box\s*\Z')]")
        for review in review_boxes:
            output += "\n=======================\n"

            if not review.css('div.persona_name'):
                continue  # skip if there is no such element

            persona_name = review.css('div.persona_name')

            if persona_name.css('a::text').extract_first() is None:
                name = "i have to search in span"
                continue
            else:
                name = str(persona_name.css('a::text').extract_first())

            if persona_name.css('a::attr(href)').extract_first() is None:
                url = "have to search in another place"
                continue
            else:
                url = str(persona_name.css('a::attr(href)').extract_first())

            if url != "None" and url is not None:
                person_id = url.split('/')[-2]
            else:
                person_id = "Doesn't exist"

            if review.css(
                    'div.num_owned_games a::text').extract_first() is None:
                num_owned_games = "Didn't find"
                continue
            else:
                num_owned_games = str(
                    review.css('div.num_owned_games a::text').extract_first()
                ).split(' ')[-1]
                num_owned_games = num_owned_games.replace(',', '')
                num_owned_games = num_owned_games.replace('.', '')

            if review.css('div.num_reviews a::text').extract_first() is None:
                num_reviews = "Didn't find"
                continue
            else:
                num_reviews_text = review.css(
                    'div.num_reviews a::text').extract_first().strip()
                if num.match(num_reviews_text):
                    num_reviews = (num.findall(num_reviews_text))[0].strip()
                    num_reviews = num_reviews.replace(',', '')
                    num_reviews = num_reviews.replace('.', '')
                else:
                    num_reviews = "0"

            if review.xpath('.//div[contains(@class, "title ellipsis")]/text()'
                            ).extract_first() is None:
                grade = "Didn't find"
                continue
            else:
                grade = review.xpath(
                    './/div[contains(@class, "title ellipsis")]/text()'
                ).extract_first()
                if grade == "Рекомендую":
                    grade = "1"
                else:
                    grade = "0"

            if review.xpath('.//div[contains(@class, "hours ellipsis")]/text()'
                            ).extract_first() is None:
                hours = "Didn't find"
                continue
            else:
                hours = review.xpath(
                    './/div[contains(@class, "hours ellipsis")]/text()'
                ).extract_first()
                hours = hours.split(' ')[-2].replace('.', '')
                hours = hours.replace(',', '')

            if review.css('div.vote_info::text').extract_first() is None:
                num_useful = "Didn't find"
                num_funny = "Didn't find"
                continue
            else:
                useful = "Not found"
                funny = "Not found"

                num_useful = '0'
                num_funny = '0'

                votes_info = review.css('div.vote_info::text').extract()

                for _ in votes_info:
                    votes = _.splitlines()
                    for vote in votes:
                        if 'полезным' in vote:
                            useful = vote.strip()
                            num_useful = num.findall(useful)[0].strip()
                        elif 'забавным' in vote:
                            funny = vote.strip()
                            num_funny = num.findall(funny)[0].strip()

            if review.css('div.content::text').extract_first() is None:
                text = "None"
                continue
            else:
                text = review.css('div.content::text').extract_first()

            num_reviews = num.findall(num_reviews_text)[0]

            output += "Name\tis:\t{}\n".format(name)
            output += "Url\tis:\t{}\n".format(url)
            output += "Id \tis:\t{}\n".format(person_id)
            output += "Owned games:\t{}\n".format(num_owned_games)
            output += "Num reviews:\t{}\n".format(num_reviews)
            output += "Grade\tis:\t{}\n".format(grade)
            output += "Ingame hours:\t{}\n".format(hours)

            output += "People think it helpful:\t{}\n".format(num_useful)
            output += "People think it funny:\t\t{}\n".format(num_funny)

            # output += "Text:\n{}\n".format(text)

            Comments.add_comment(Comments, text, num_owned_games, num_reviews,
                                 grade, hours, num_useful, num_funny)

            output += "=======================\n"

        ut.write_html(self.dest + "reviewers.txt", output)

        # output = ""
        # comments = selector.css('div.review_box').css('div.content::text').extract()
        # for comment in comments:
        #     comment = comment.strip()
        #     if not comment:
        #         continue    # skip if the line is empty
        #     output += "\n=============================\n"
        #     output += comment
        #     output += "\n=============================\n"

        # ut.write_html(self.dest + 'comments.txt', output)

        print("==============\nended parsing json\n===============")