def parse(self, response):
    xml = Selector(response)
    xml.remove_namespaces()
    urls = xml.xpath('//loc/text()').extract()
    for url in urls:
        yield scrapy.Request(response.urljoin(url), callback=self.parse_hotel)
def parse(self, response):
    xml = Selector(response)
    xml.remove_namespaces()
    urls = xml.xpath('//loc/text()').extract()
    for url in urls:
        yield scrapy.Request(url, callback=self.parse_store)
def _extract_links(self, response):
    type = 'html'
    if response.body_as_unicode().strip().startswith('<?xml version='):
        type = 'xml'
    xxs = Selector(response, type=type)
    if self.remove_namespaces:
        xxs.remove_namespaces()
    for url in xxs.xpath(self.xpath).extract():
        yield Link(url.encode(response.encoding))
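# Every snippet in this collection hinges on Selector.remove_namespaces(); a
# minimal, self-contained demo of what that call changes (the XML payload
# below is invented for illustration):
from scrapy.selector import Selector

demo_xml = u"""<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/a</loc></url>
  <url><loc>https://example.com/b</loc></url>
</urlset>"""

sel = Selector(text=demo_xml, type='xml')
print(sel.xpath('//loc/text()').extract())   # [] - the default namespace hides <loc>
sel.remove_namespaces()
print(sel.xpath('//loc/text()').extract())   # ['https://example.com/a', 'https://example.com/b']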
def parse(self, response):
    sel = Selector(response)
    sel.remove_namespaces()
    mbid = sel.xpath("//release-group[1]/@id").extract()
    if mbid:
        yield Request("http://coverartarchive.org/release-group/" + mbid[0],
                      callback=self.parse_album, meta=response.meta)
def parse(self, response):
    xml = Selector(response)
    xml.remove_namespaces()
    urls = xml.xpath('//loc/text()').extract()
    for url in urls:
        path = "/".join(urlparse(url).path.split('/')[:-1])
        yield scrapy.Request(response.urljoin(path), callback=self.parse_hotel)
def parse(self, response):
    xml = Selector(response)
    xml.remove_namespaces()
    urls = xml.xpath('//loc/text()').extract()
    for url in urls:
        yield scrapy.Request(url, callback=self.parse_store,
                             headers=DEFAULT_HEADERS, meta=response.meta)
def parse_item(self, response):
    # self.log('Crawling the link: %s' % response.url)
    sel = Selector(response)
    sel.remove_namespaces()
    topic = sel.xpath("//div[@id='SHOW_TOPIC']")
    item = TripAdvisorForumPostItem()
    item['text'] = topic.xpath("//div[@class='postBody']").extract()  # topic.xpath("//p/text()").extract()
    # item['date'] = topic.xpath("//div[@class='postDate']/text()").extract()
    item['link'] = response.url
    return item
def parse(self, response): xml = Selector(response) xml.remove_namespaces() urls = xml.xpath("//loc/text()").extract() urls = [url.strip() for url in urls] for url in urls: path = scrapy.utils.url.parse_url(url).path if re.match(r"^/.*/.*/.*$", path): yield scrapy.Request(url, callback=self.parse_location)
def parse(self, response):
    xml = Selector(response)
    xml.remove_namespaces()
    urls = xml.xpath('//loc/text()').extract()
    for url in urls:
        if re.match(r'^https://99only.com/stores/.+?$', url):
            if 'near-me' in url:
                continue
            yield scrapy.Request(url, callback=self.parse_location)
def parse(self, response):
    xml = Selector(response)
    xml.remove_namespaces()
    urls = xml.xpath('//loc/text()').extract()
    urls = [url.strip() for url in urls]
    for url in urls:
        if '/home/' in url:
            continue
        yield scrapy.Request(url, callback=self.parse_state_sitemap)
def _extract_links(self, response):
    body = response.body_as_unicode()
    _type = 'html'
    if body.lstrip().startswith('<?xml version='):
        _type = 'xml'
    xxs = Selector(text=body, type=_type)
    if self.remove_namespaces:
        xxs.remove_namespaces()
    for url in xxs.xpath(self.xpath).extract():
        if not isinstance(url, six.text_type):
            url = url.encode(response.encoding)
        yield Link(url)
def parse_state_sitemap(self, response):
    xml = Selector(response)
    xml.remove_namespaces()
    urls = xml.xpath('//loc/text()').extract()
    urls = [url.strip() for url in urls]
    for url in urls:
        # Store URLs follow this pattern: three path segments
        # (state/city/address), and the last one cannot be a postal code only.
        if re.match(r'^https://pizza.dominos.com/.*?/.*?/(?!(\d{5}/)).*?/$', url):
            yield scrapy.Request(url, callback=self.parse_place)
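# The negative lookahead in the pattern above is meant to reject store URLs
# whose last segment is only a five-digit postal code; a quick standalone check
# of that regex (the example URLs are invented):
import re

STORE_URL_RE = r'^https://pizza.dominos.com/.*?/.*?/(?!(\d{5}/)).*?/$'
print(bool(re.match(STORE_URL_RE, 'https://pizza.dominos.com/texas/austin/123-main-st/')))  # True
print(bool(re.match(STORE_URL_RE, 'https://pizza.dominos.com/texas/austin/73301/')))        # False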
def parse(self, response):
    xml = Selector(response)
    xml.remove_namespaces()
    urls = xml.xpath('//loc/text()').extract()
    urls = [url.strip() for url in urls]
    for url in urls:
        if re.search(r'orlando-university', url):
            pass
        elif re.search(r'locations/', url):
            yield scrapy.Request(url, callback=self.parse_location)
def parse_state_sitemap(self, response):
    xml = Selector(response)
    xml.remove_namespaces()
    urls = xml.xpath('//loc/text()').extract()
    urls = [url.strip() for url in urls]
    # Individual store pages are listed first, followed by a state page and a
    # number of other non-store pages. Find the index of the state page and
    # only parse the URLs before it.
    i = urls.index(
        re.search(r'^(https://locations.rentacenter.com/.+?)/.*$', urls[0]).groups()[0] + '/')
    for url in urls[:i]:
        yield scrapy.Request(url, callback=self.parse_location)
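# Most of the parse() methods above hand-roll the same sitemap pattern: strip
# namespaces, select //loc/text(), filter the URLs, and yield requests. For the
# simpler cases Scrapy's built-in SitemapSpider covers this directly; a minimal
# sketch, with a placeholder spider name, sitemap URL, and callback (none of
# them taken from the snippets above):
from scrapy.spiders import SitemapSpider

class ExampleSitemapSpider(SitemapSpider):
    name = 'example_sitemap'
    sitemap_urls = ['https://example.com/sitemap.xml']   # placeholder sitemap
    sitemap_rules = [('/stores/', 'parse_location')]     # route matching URLs to the callback

    def parse_location(self, response):
        # Placeholder extraction; the real fields depend on the target pages.
        yield {
            'url': response.url,
            'title': response.xpath('//title/text()').extract_first(),
        }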
def parse(self, response):
    content_selector = Selector(text=response.body)
    content_selector.register_namespace('arxiv', 'http://arxiv.org/schemas/atom')
    content_selector.register_namespace('xmlns', 'http://www.w3.org/2005/Atom')
    content_selector.remove_namespaces()
    for line in content_selector.xpath('//feed/entry'):
        item = TestItem()
        item['id'] = line.xpath('id/text()').extract()
        item['title'] = line.xpath('title/text()').extract()
        item['links'] = line.xpath('link/@href').extract()
        item['authors'] = line.xpath('author/name/text()').extract()
        item['comments'] = line.xpath('comment/text()').extract()
        item['primary_category'] = line.xpath('primary_category/@term').extract()
        item['categories'] = line.xpath('category/@term').extract()
        item['summary'] = line.xpath('summary/text()').extract()
        yield item
def parse_job_index(self, response):
    for job in super(ATSSpider2_RecursiveList, self).parse_job_index(response):
        yield job
    if self.ruleindex is None:
        raise CloseSpider('No jobs found.')
    sel = Selector(response)
    sel.remove_namespaces()
    loader = ItemLoader(selector=sel)
    rule = arg_to_iter(self.parse_job_index_rules)[self.ruleindex].get('recursive')
    if rule is None:
        raise CloseSpider('parse_job_index_rules has no recursive rules')
    if self.extract_from_rule(loader, rule.get('condition')):
        if self.extract_from_rule(loader, rule.get('from_response', {'static': False})):
            url = self.extract_from_rule(loader, rule.get('url'))
            if url:
                yield FormRequest.from_response(
                    response,
                    url=url,
                    formdata=self.extract_from_rule(loader, rule.get('formdata')),
                    callback=self.parse_job_index,
                    dont_filter=self.extract_from_rule(loader, rule.get('dont_filter', {'static': False})),
                    dont_click=self.extract_from_rule(loader, rule.get('dont_click', {'static': False}))
                )
            else:
                yield FormRequest.from_response(
                    response,
                    formdata=self.extract_from_rule(loader, rule.get('formdata')),
                    callback=self.parse_job_index,
                    dont_filter=self.extract_from_rule(loader, rule.get('dont_filter', {'static': False})),
                    dont_click=self.extract_from_rule(loader, rule.get('dont_click', {'static': False}))
                )
        else:
            yield FormRequest(
                url=self.extract_from_rule(loader, rule.get('url')),
                formdata=self.extract_from_rule(loader, rule.get('formdata')),
                callback=self.parse_job_index,
                dont_filter=self.extract_from_rule(loader, rule.get('dont_filter', {'static': False}))
            )
def parse_item(self, response):
    sel = Selector(response)
    sel.remove_namespaces()
    reviews = sel.xpath("//div[@class='reviewSelector ']")
    items = []
    for review in reviews:
        item = TripAdvisorAttractionReviewItem()
        item['text'] = review.xpath("div[@class='review basic_review inlineReviewUpdate provider0 newFlag']/div[@class='col2of2']/div[@class='innerBubble']/div[@class='wrap']/div[@class='entry']/p/text()").extract()
        if not item['text']:
            continue
        score = review.xpath("div[@class='review basic_review inlineReviewUpdate provider0 newFlag']/div[@class='col2of2']/div[@class='innerBubble']/div[@class='wrap']/div[@class='rating reviewItemInline']/span/img/@alt").extract()
        if score:
            regex = re.compile(r"(\d)")
            r = regex.findall(str(score))    # e.g. "4 of 5 stars"
            item['score'] = float(r[0])      # 4
            item['max_score'] = float(r[1])  # 5
        items.append(item)
    return items
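# The score handling above pulls both numbers out of an alt attribute such as
# "4 of 5 stars"; a quick standalone check of that extraction (the sample alt
# text is invented):
import re

alt_text = "4 of 5 stars"
r = re.findall(r"(\d)", alt_text)            # ['4', '5']
score, max_score = float(r[0]), float(r[1])
print(score, max_score)                      # 4.0 5.0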
def parse_journal_issue(self, dataset, target_folder, filename):
    """Parse journal issue tags and files, if there are any in the dataset.xml.

    The journal issue XMLs contain all the DOIs for articles in that issue.
    Extract this data, and later update it from the journal item XML.
    """
    data = []
    for issue in dataset.xpath('//journal-issue'):
        tmp = {
            'volume': "%s %s" % (
                issue.xpath('//volume-issue-number/vol-first/text()')[0].extract(),
                issue.xpath('//volume-issue-number/suppl/text()')[0].extract()),
        }
        issue_file = os.path.join(
            target_folder, filename,
            issue.xpath('./files-info/ml/pathname/text()')[0].extract())
        self.log('Parsing journal issue xml: %s' % issue_file, logging.INFO)
        articles = {}
        with open(issue_file, 'r') as issue_file:
            iss = Selector(text=issue_file.read())
            iss.remove_namespaces()
            for article in iss.xpath('//include-item'):
                doi = article.xpath('./doi/text()')[0].extract()
                first_page = None
                if article.xpath('./pages/first-page/text()'):
                    first_page = article.xpath('./pages/first-page/text()')[0].extract()
                last_page = None
                if article.xpath('./pages/last-page/text()'):
                    last_page = article.xpath('./pages/last-page/text()')[0].extract()
                articles[doi] = {'first-page': first_page, 'last-page': last_page}
        tmp['articles'] = articles
        data.append(tmp)
    return data
def handle_package(self, response):
    """Handle the zip package and yield a request for every XML found."""
    import traceback
    with open(response.meta["local_filename"], 'w') as destination_file:
        destination_file.write(response.body)
    filename = os.path.basename(response.url).rstrip("A.tar").rstrip('.zip')
    # TMP dir to extract zip packages:
    target_folder = mkdtemp(prefix=filename + "_", dir=ELSEVIER_UNPACK_FOLDER)
    zip_filepath = response.meta["local_filename"]
    files = uncompress(zip_filepath, target_folder)
    # The xml files shouldn't be removed after processing; they will
    # be later uploaded to Inspire. So don't remove any tmp files here.
    try:
        for f in files:
            if 'dataset.xml' in f:
                from scrapy.selector import Selector
                with open(f, 'r') as dataset_file:
                    dataset = Selector(text=dataset_file.read())
                    data = []
                    for i, issue in enumerate(dataset.xpath('//journal-issue')):
                        tmp = {}
                        tmp['volume'] = "%s %s" % (
                            issue.xpath('//volume-issue-number/vol-first/text()')[0].extract(),
                            issue.xpath('//volume-issue-number/suppl/text()')[0].extract())
                        tmp['issue'] = issue.xpath('//issn/text()')[0].extract()
                        issue_file = os.path.join(
                            target_folder, filename,
                            issue.xpath('./files-info/ml/pathname/text()')[0].extract())
                        arts = {}
                        with open(issue_file, 'r') as issue_file:
                            iss = Selector(text=issue_file.read())
                            iss.remove_namespaces()
                            for article in iss.xpath('//include-item'):
                                doi = article.xpath('./doi/text()')[0].extract()
                                first_page = None
                                if article.xpath('./pages/first-page/text()'):
                                    first_page = article.xpath('./pages/first-page/text()')[0].extract()
                                last_page = None
                                if article.xpath('./pages/last-page/text()'):
                                    last_page = article.xpath('./pages/last-page/text()')[0].extract()
                                arts[doi] = {
                                    'files': {'xml': None, 'pdf': None},
                                    'first-page': first_page,
                                    'last-page': last_page,
                                }
                        tmp['articles'] = arts
                        data.append(tmp)
                    tmp_empty_data = 0
                    if not data:
                        tmp_empty_data = 1
                        data.append({'volume': None, 'issue': None, 'articles': {}})
                    for article in dataset.xpath('//journal-item'):
                        doi = article.xpath('./journal-item-unique-ids/doi/text()')[0].extract()
                        if article.xpath('./journal-item-properties/online-publication-date/text()'):
                            publication_date = article.xpath(
                                './journal-item-properties/online-publication-date/text()')[0].extract()[:18]
                        else:
                            publication_date = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
                        journal = article.xpath('./journal-item-unique-ids/jid-aid/jid/text()')[0].extract()
                        if journal == "PLB":
                            journal = "Physics Letters B"
                        if journal == "NUPHB":
                            journal = "Nuclear Physics B"
                        if tmp_empty_data:
                            data[0]['articles'][doi] = {
                                'files': {'xml': None, 'pdf': None},
                                'first-page': None,
                                'last-page': None,
                            }
                        for i, issue in enumerate(data):
                            if doi in data[i]['articles']:
                                data[i]['articles'][doi]['journal'] = journal
                                data[i]['articles'][doi]['publication-date'] = publication_date
                                xml = os.path.join(
                                    target_folder, filename,
                                    article.xpath('./files-info/ml/pathname/text()')[0].extract())
                                pdf = os.path.join(
                                    target_folder, filename,
                                    article.xpath('./files-info/web-pdf/pathname/text()')[0].extract())
                                data[i]['articles'][doi]['files']['xml'] = xml
                                data[i]['articles'][doi]['files']['pdf'] = pdf
                                if 'vtex' in zip_filepath:
                                    pdfa = os.path.join(os.path.split(pdf)[0], 'main_a-2b.pdf')
                                    pdfa = os.path.join(target_folder, pdfa)
                                    data[i]['articles'][doi]['files']['pdfa'] = pdfa
                    for i, issue in enumerate(data):
                        print('a')
                        for doi in data[i]['articles']:
                            print('b')
                            try:
                                print('try')
                                xml_file = open(data[i]['articles'][doi]['files']['xml'], 'r')
                                print(xml_file)
                                xml_file_content = xml_file.read()
                                for nodename in self.itertag:
                                    print(nodename)
                                    for selector in xmliter(xml_file_content, nodename):
                                        print(selector)
                                        yield self.parse_node(data[i], doi, zip_filepath, selector)
                            except:
                                print(traceback.print_exc())
    except:
        import traceback
        traceback.print_exc()
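# The tail of handle_package() streams article nodes out of each XML file with
# scrapy.utils.iterators.xmliter before handing them to parse_node(). A minimal
# standalone sketch of that helper (the XML document and node/field names here
# are made up for illustration):
from scrapy.utils.iterators import xmliter

demo_articles = u"""
<articles>
  <article><doi>10.1000/demo.1</doi></article>
  <article><doi>10.1000/demo.2</doi></article>
</articles>
"""

# xmliter yields one Selector per <article> node, which can then be queried
# with relative XPath exactly as parse_node() would receive it.
for node in xmliter(demo_articles, 'article'):
    print(node.xpath('./doi/text()').extract_first())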
def parse(self, response):
    sel = Selector(response)
    sel.remove_namespaces()
    mbid = sel.xpath("//release-group[1]/@id").extract()
    if mbid:
        yield Request("http://coverartarchive.org/release-group/" + mbid[0],
                      callback=self.parse_album, meta=response.meta)
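# Both MusicBrainz snippets (this one and the identical parse() earlier) stop at
# requesting the Cover Art Archive release-group endpoint. A hedged sketch of
# what a parse_album callback might do with the JSON that endpoint returns; the
# callback body and the 'mbid' meta key are assumptions, only the endpoint URL
# comes from the snippets above:
import json

def parse_album(self, response):
    # The release-group response carries an "images" list; pick the image
    # flagged as the front cover, if any.
    data = json.loads(response.text)
    for image in data.get('images', []):
        if image.get('front'):
            yield {
                'mbid': response.meta.get('mbid'),  # assumed meta key
                'cover_url': image.get('image'),
            }
            break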
def parse(self, response):
    x = Selector(response)
    x.remove_namespaces()
    items = x.xpath('//record/metadata/RDF')
    jsons = []
    for item in items:
        media_type = ""
        layer_type = ""
        # Get the ending of the "alternative" title, which gives the file extension.
        file_ext = ""
        if item.xpath('Resource/alternative/Description/value/text()').extract():
            file_ext = item.xpath('Resource/alternative/Description/value/text()').extract()[0].split('.')[-1]
        # We don't want these formats.
        if file_ext.lower() in ("cvs", "xls", "txt"):
            continue
        if file_ext.lower() in ("pdf", "doc", "docx", "rtf"):
            media_type = "Document"
            layer_type = file_ext.upper()
        if file_ext.lower() in ("jpg", "jpeg", "png"):
            media_type = "Image"
            layer_type = "Image"
        if file_ext.lower() in ("mp3", "wav", "m4a", "wma"):
            media_type = "Audio"
            layer_type = "Audio"
        if file_ext.lower() in ("mp4", "mpeg", "wmv", "avi"):
            media_type = "Video"
            layer_type = "Video"
        # NDL link format
        format = item.xpath('Resource/materialType/@resource').extract()[0]
        if media_type == "" and layer_type == "":
            if format in ("http://purl.org/dc/dcmitype/StillImage",
                          "http://ndl.go.jp/ndltype/Photograph"):
                media_type = "Image"
                layer_type = "Image"
            if format == "http://purl.org/dc/dcmitype/MovingImage":
                media_type = "Video"
                layer_type = "Video"
            if format == "http://purl.org/dc/dcmitype/Sound":
                media_type = "Audio"
                layer_type = "Audio"
        # Still nothing: fall back to the extension of the access URL.
        if media_type == "" and layer_type == "":
            if item.xpath('Resource/accessURL/@resource').extract():
                file_ext = item.xpath('Resource/accessURL/@resource').extract()[0].split('.')[-1]
                # We don't want these formats.
                if file_ext.lower() in ("cvs", "xls", "txt"):
                    continue
                if file_ext.lower() in ("pdf", "doc", "docx", "rtf"):
                    media_type = "Document"
                    layer_type = file_ext.upper()
                if file_ext.lower() in ("jpg", "jpeg", "png"):
                    media_type = "Image"
                    layer_type = "Image"
                if file_ext.lower() in ("mp3", "wav", "m4a", "wma"):
                    media_type = "Audio"
                    layer_type = "Audio"
                if file_ext.lower() in ("mp4", "mpeg", "wmv", "avi"):
                    media_type = "Video"
                    layer_type = "Video"
        creator = item.xpath('Resource/publisher/Agent/name/text()').extract()
        title = item.xpath('Resource/title/Description/value/text()').extract()
        uri = item.xpath('Resource/@about').extract()
        attribution_uri = item.xpath('MetaResource/@about').extract()
        date_created = item.xpath('MetaResource/created/text()').extract()
        access_url = item.xpath('Resource/accessURL/@resource').extract()
        if media_type in ("Image", "Audio", "Video", "Document"):
            if access_url:
                uri = access_url
            if not date_created:
                media_date_created = "0000-00-00 00:00:00"
            else:
                date = date_created[0].split('T')[0]
                time = date_created[0].split('T')[1].split('+')[0]
                media_date_created = date + ' ' + time
            tags = item.xpath('Resource/subject/text()').extract()
            new_tags = []
            for tag in tags:
                if tag == '\n':
                    continue
                if len(tag.split(u'\u3001')) == 1:
                    new_tags += tag.split(',')
                else:
                    new_tags += tag.split(u'\u3001')
            tags_string = '"' + '", "'.join(new_tags) + '"'
            thumbnail = item.xpath('Resource/thumbnail/@resource').extract()
            locality = item.xpath('Resource/spatial/Description/label/text()').extract()
            newloc = locality[0] if locality else ''
            if not thumbnail:
                newthumb = ''
            elif thumbnail[0] == "http://fukushima.archive-disasters.jp/images/file_normal_icon.jpg":
                newthumb = ''
            else:
                newthumb = thumbnail[0]
            title = title[0] if title else ''
            uri = uri[0] if uri else ''
            attribution_uri = attribution_uri[0] if attribution_uri else ''
            creator = creator[0] if creator else ""
            if newthumb == '':
                thumb = ''
            else:
                thumb = '", "thumbnail_url": "' + newthumb
            json_entry = ('{"title": "' + title + '", "description": "", "uri": "' + uri +
                          '", "attribution_uri": "' + attribution_uri +
                          '", "media_creator_username": "******", "location": "' + newloc +
                          '", "media_date_created": "' + media_date_created +
                          '", "tags": [' + tags_string +
                          '], "archive":"Fukushima Disaster Archives", "media_type": "' + media_type +
                          '", "layer_type": "' + layer_type +
                          '", "child_items_count":0, "published":1}, ')
            jsons.append(json_entry)
    resumptionToken = x.xpath('//resumptionToken/text()').extract()
    if not resumptionToken:
        print "FINISHED"
        nextFileLink = ''
        open('last.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
    else:
        nextFileLink = ("http://fukushima.archive-disasters.jp/infolib/oai_repository/repository"
                        "?verb=ListRecords&resumptionToken=" + resumptionToken[0].encode('ascii'))
        filename = resumptionToken[0].replace('!', '').replace(':', '')
        open(filename.encode('ascii') + '.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
    yield Request(nextFileLink, callback=self.parse)
    ''', re.X)
conn = db_settings.con()
c = conn.cursor()
county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
county = common.county_abbr2string(county_abbr)
election_year = common.election_year(county)
county_abbr3 = common.county2abbr3(county)
for file in glob.glob(u'../../../data/%s/meeting_minutes/%s/html/*.html' % (county_abbr, election_year)):
    with codecs.open(file, 'r', encoding='utf-8') as f:
        print f.name
        fileName, fileExt = os.path.splitext(os.path.basename(f.name))
        xml_text = unicodedata.normalize('NFC', f.read())
        x = Selector(text=xml_text, type='html')
        x.remove_namespaces()
        year = int(x.xpath('//text()').re(u'表\s*[((](\d+)/\d+/\d+')[0]) + 1911
        d = {}
        for table in x.xpath(u'//table'):
            days = table.xpath('descendant::tr[1]/td//text()').re('(\d+/\d+)')
            dates = ['%d-%02d-%02d' % (year, int(day.split('/')[0]), int(day.split('/')[1])) for day in days]
            for tr in table.xpath('descendant::tr[td[1][re:test(., "^\d+$")]]'):
                name = re.sub(u'[.﹒]', u'‧', tr.xpath('td[2]//text()').extract_first() or '')
                if not name:
                    continue
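# The "+ 1911" above converts a Republic of China (Minguo) calendar year, as
# printed in the meeting minutes, to a Gregorian year; a one-line sanity check
# with an invented value:
roc_year = 103
print(roc_year + 1911)  # Minguo 103 corresponds to Gregorian 2014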
def parse_job_index(self, response):
    '''
    Parses the job list page using the rules from `self.parse_job_index_rules`.

    base - xpath that selects the job item group
    joburl - rule that returns the url from the job item group
    relative - set of rules that return the specified fields from the job item group
    nonrelative - set of rules that return the specified fields from the page that are
        outside the job item group but are in the same order as them
    formdata - rule that returns the formdata for the job request
    dont_filter/dont_click/from_response - rules that determine whether to enable these
        options for the job request

    ex.
    parse_job_index_rules = [
        {
            'base': xpath,
            'joburl': {
                'xpath': [
                    {'xpaths': ['xpath1', 'xpath2', 'xpath3'],
                     'processors': [proc1, proc2],
                     're': regex},
                ],
                'function': foo,
                'value': {'values': 'foo', 'processors': [proc1, proc2, proc3], 're': regex}
            },
            'relative': {
                'title': {
                    'xpath': [
                        {'xpaths': ['xpath1', 'xpath2', 'xpath3'],
                         'processors': [proc1, proc2],
                         're': regex},
                    ],
                    'function': foo,
                    'value': {'values': 'foo', 'processors': [proc1, proc2, proc3], 're': regex}
                },
                ...
            },
            'nonrelative': {
                'title': {
                    'xpath': [
                        {'xpaths': ['xpath1', 'xpath2', 'xpath3'],
                         'processors': [proc1, proc2],
                         're': regex},
                    ],
                    'function': foo,
                    'value': {'values': 'foo', 'processors': [proc1, proc2, proc3], 're': regex}
                },
                ...
            },
            'formdata': {
                'xpath': [
                    {'xpaths': ['xpath1', 'xpath2', 'xpath3'],
                     'processors': [proc1, proc2],
                     're': regex},
                ],
                'function': foo,
                'value': {'values': 'foo', 'processors': [proc1, proc2, proc3], 're': regex}
            },
            'dont_filter': {'static': False},
            'dont_click': {'static': False},
            'from_response': {'static': False}
        }
    ]
    '''
    sel = Selector(response)
    sel.remove_namespaces()
    loader = ItemLoader(selector=sel)
    for i, r in enumerate(arg_to_iter(self.parse_job_index_rules)):
        jobs = sel.xpath(r.get('base'))
        if jobs:
            self.ruleindex = i
            rule = r
            break
    else:
        rule = {}
    customitems = []
    for count in rule.get('jobcount', {}):
        self.extract_from_rule(loader, rule.get('jobcount'), response)
    for field in rule.get('nonrelative', {}):
        value = self.extract_from_rule(loader, rule.get('nonrelative').get(field), response)
        customitems.append([(field, val) for val in value])
    for ji, u in enumerate(izip_longest(jobs, *customitems, fillvalue=None)):
        jobloader = ItemLoader(selector=u[0])
        formdata = {}
        meta = {'custom_items': response.meta.get('custom_items', {})}
        for i in xrange(1, len(u)):
            meta['custom_items'][u[i][0]] = u[i][1]
        for field in rule.get('relative', {}):
            meta['custom_items'][field] = self.extract_from_rule(
                jobloader, rule.get('relative').get(field), response)
        if self.extract_from_rule(loader, rule.get('from_response', {'static': False}),
                                  response, jobindex=ji):
            yield FormRequest.from_response(
                response,
                formdata=self.extract_from_rule(jobloader, rule.get('formdata'), response, jobindex=ji),
                callback=self.parse_job_callback(),
                meta=meta,
                dont_filter=self.extract_from_rule(loader, rule.get('dont_filter', {'static': False}),
                                                   response, jobindex=ji),
                dont_click=self.extract_from_rule(loader, rule.get('dont_click', {'static': False}),
                                                  response, jobindex=ji)
            )
        else:
            yield FormRequest(
                url=self.extract_from_rule(jobloader, rule.get('joburl'), response, jobindex=ji),
                formdata=self.extract_from_rule(jobloader, rule.get('formdata'), response, jobindex=ji),
                callback=self.parse_job_callback(),
                meta=meta,
                dont_filter=self.extract_from_rule(loader, rule.get('dont_filter', {'static': False}),
                                                   response, jobindex=ji)
            )
def parse_json_comments(self, response):
    print("==============\nstart parsing json\n===============")
    # Regular expression for matching a number
    num = re.compile(r'[0-9]+\.?[0-9]*')
    data = json.loads(response.body)
    ut.write_html(self.dest + "comments.html", data['html'])
    # Replace <br> tags so the comment text stays intact
    html = data['html'].replace('<br>', '\n')
    selector = Selector(text=html)
    selector.remove_namespaces()
    output = ""
    # Use a regular expression to pick up only complete review blocks
    review_boxes = selector.xpath(r"//div[re:test(@class, '\Areview_box\s*\Z')]")
    for review in review_boxes:
        output += "\n=======================\n"
        if review.css('div.persona_name') is None:
            continue  # Skip the review if this block doesn't exist
        persona_name = review.css('div.persona_name')
        if persona_name.css('a::text').extract_first() is None:
            name = "i have to search in span"
            continue
        else:
            name = str(persona_name.css('a::text').extract_first())
        if persona_name.css('a::attr(href)').extract_first() is None:
            url = "have to search in another place"
            continue
        else:
            url = str(persona_name.css('a::attr(href)').extract_first())
        if url != "None" and url is not None:
            person_id = url.split('/')[-2]
        else:
            person_id = "Doesn't exist"
        if review.css('div.num_owned_games a::text').extract_first() is None:
            num_owned_games = "Didn't find"
            continue
        else:
            num_owned_games = str(review.css('div.num_owned_games a::text').extract_first()).split(' ')[-1]
            num_owned_games = num_owned_games.replace(',', '')
            num_owned_games = num_owned_games.replace('.', '')
        if review.css('div.num_reviews a::text').extract_first() is None:
            num_reviews = "Didn't find"
            continue
        else:
            num_reviews_text = review.css('div.num_reviews a::text').extract_first().strip()
            if num.match(num_reviews_text):
                num_reviews = (num.findall(num_reviews_text))[0].strip()
                num_reviews = num_reviews.replace(',', '')
                num_reviews = num_reviews.replace('.', '')
            else:
                num_reviews = "0"
        if review.xpath('.//div[contains(@class, "title ellipsis")]/text()').extract_first() is None:
            grade = "Didn't find"
            continue
        else:
            grade = review.xpath('.//div[contains(@class, "title ellipsis")]/text()').extract_first()
            if grade == "Рекомендую":
                grade = "1"
            else:
                grade = "0"
        if review.xpath('.//div[contains(@class, "hours ellipsis")]/text()').extract_first() is None:
            hours = "Didn't find"
            continue
        else:
            hours = review.xpath('.//div[contains(@class, "hours ellipsis")]/text()').extract_first()
            hours = hours.split(' ')[-2].replace('.', '')
            hours = hours.replace(',', '')
        if review.css('div.vote_info::text').extract_first() is None:
            num_useful = "Didn't find"
            num_funny = "Didn't find"
            continue
        else:
            useful = "Not found"
            funny = "Not found"
            num_useful = '0'
            num_funny = '0'
            votes_info = review.css('div.vote_info::text').extract()
            for _ in votes_info:
                votes = _.splitlines()
                for vote in votes:
                    if 'полезным' in vote:
                        useful = vote.strip()
                        num_useful = num.findall(useful)[0].strip()
                    elif 'забавным' in vote:
                        funny = vote.strip()
                        num_funny = num.findall(funny)[0].strip()
        if review.css('div.content::text').extract_first() is None:
            text = "None"
            continue
        else:
            text = review.css('div.content::text').extract_first()
        num_reviews = num.findall(num_reviews_text)[0]
        output += "Name\tis:\t{}\n".format(name)
        output += "Url\tis:\t{}\n".format(url)
        output += "Id \tis:\t{}\n".format(person_id)
        output += "Owned games:\t{}\n".format(num_owned_games)
        output += "Num reviews:\t{}\n".format(num_reviews)
        output += "Grade\tis:\t{}\n".format(grade)
        output += "Ingame hours:\t{}\n".format(hours)
        output += "People think it helpful:\t{}\n".format(num_useful)
        output += "People think it funny:\t\t{}\n".format(num_funny)
        # output += "Text:\n{}\n".format(text)
        Comments.add_comment(Comments, text, num_owned_games, num_reviews,
                             grade, hours, num_useful, num_funny)
        output += "=======================\n"
    ut.write_html(self.dest + "reviewers.txt", output)
    # output = ""
    # comments = selector.css('div.review_box').css('div.content::text').extract()
    # for comment in comments:
    #     comment = comment.strip()
    #     if not comment:
    #         continue  # Skip empty lines
    #     output += "\n=============================\n"
    #     output += comment
    #     output += "\n=============================\n"
    # ut.write_html(self.dest + 'comments.txt', output)
    print("==============\nended parsing json\n===============")