def _parse_handler(self, response):
    """Re-fetch *response.url* through the Selenium driver and dump the rendered HTML."""
    self.logger.info("parse url---:{0}".format(response.url))
    self.driver.get(response.url)
    # Wrap the browser-rendered source in a Selector so XPath/CSS queries
    # can be run against the post-JavaScript DOM.
    rendered = Selector(text=self.driver.page_source)
    print(rendered.extract())
def name_vn(cls, input_value):
    """Extract the Vietnamese part of a page title.

    Looks first at span[itemprop=title] text, then at the <h1> text, and
    uses dash-separated patterns to split the title.  NOTE(review): this
    relies on Python 2 semantics — `str()` on a unicode value raises
    UnicodeEncodeError only for non-ASCII text, which the code uses as a
    "this is Vietnamese" detector.  Under Python 3 the UnicodeEncodeError
    branches are unreachable — confirm target runtime.
    """
    if isinstance(input_value, list):
        input_value = ''.join(input_value)
    # Fallback candidate taken from the itemprop=title span.
    name_1 = Selector(text=input_value).xpath("//span[@itemprop='title']//text()").extract()
    try:
        # Raises UnicodeEncodeError (py2) when the span text is non-ASCII.
        name_1 = str(''.join(name_1))
        name = Selector(text=input_value).xpath(
            "//h1//text()")
        if len(name.re("(.*-.*-.*)")) > 0:
            # Title shaped like "a - b - c".
            try:
                # ASCII-only title: treated as not-Vietnamese, return None.
                name_en = str(''.join(name.re("(.*-.*-.*)")))
                return
            except UnicodeEncodeError:
                # Non-ASCII title: return it as the Vietnamese name.
                return ''.join(name.re("(.*-.*-.*)"))
        elif len(name.re("(.*)-")) > 0:
            # Title shaped like "vn - en": take the part before the dash.
            name_vn = name.re("(.*)-")
            return name_vn
        else:
            # No dash at all: use the whole <h1> text.
            name = name.extract()
            if isinstance(name, list):
                name = ''.join(name)
            try:
                # ASCII-only: not Vietnamese, return None.
                name = str(name)
                return
            except UnicodeEncodeError:
                return name
    except UnicodeEncodeError:
        # The span text itself was non-ASCII: return it directly.
        return ''.join(name_1)
    else:
        # Unreachable: every branch above returns; kept for fidelity.
        return
def parse(self, response):
    """Render the page with Selenium Firefox and yield a {name: coefficient} dict.

    Honors several optional spider attributes: ``page_loaded_flag`` /
    ``post_load_content_flag`` (CSS class names to wait for) and
    ``self.xpath`` keys ``select_form`` (element to click before scraping),
    ``block_xpath`` / ``elements_xpath`` (where to find the rows) and
    ``element_xpath_name`` / ``element_xpath_rate`` (per-row fields).
    """
    self.driver = webdriver.Firefox(
        executable_path=settings.get('WEB_DRIVER_PATH'))
    self.driver.get(response.request.url)
    # Optional wait: page signals readiness via a CSS class.
    if hasattr(self, 'page_loaded_flag'):
        WebDriverWait(self.driver, settings.get('DRIVER_TIMEOUT')).until(
            EC.visibility_of_element_located(
                (By.CLASS_NAME, self.page_loaded_flag)))
    # Optionally click a form element (e.g. a tab/filter) before scraping.
    if 'select_form' in self.xpath:
        select_form = self.xpath.get('select_form')
        WebDriverWait(self.driver, settings.get('DRIVER_TIMEOUT')).until(
            EC.visibility_of_element_located((By.XPATH, select_form)))
        self.driver.find_element_by_xpath(select_form).click()
    # Optional wait for content that only appears after the click above.
    if hasattr(self, 'post_load_content_flag'):
        WebDriverWait(self.driver, settings.get('DRIVER_TIMEOUT')).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, self.post_load_content_flag)))
    # Scrape from the browser-rendered source, not the raw response.
    selector = Selector(text=self.driver.page_source)
    if 'block_xpath' in self.xpath:
        # Rows are nested inside a container block; narrow to it first.
        block_xpath = self.xpath.get('block_xpath')
        elements_xpath = self.xpath.get('elements_xpath')
        WebDriverWait(self.driver, settings.get('DRIVER_TIMEOUT')).until(
            EC.visibility_of_element_located((By.XPATH, block_xpath)))
        # NOTE(review): `team_link` here is the container selector; the
        # same name is reused below as the per-row loop variable.
        team_link = Selector(text=selector.xpath(
            self.xpath.get('block_xpath')).extract_first())
        WebDriverWait(self.driver, settings.get('DRIVER_TIMEOUT')).until(
            EC.visibility_of_element_located((By.XPATH, elements_xpath)))
        data = team_link.xpath(self.xpath.get('elements_xpath'))
    else:
        # Rows are addressable directly from the document root.
        elements_xpath = self.xpath.get('elements_xpath')
        WebDriverWait(self.driver, settings.get('DRIVER_TIMEOUT')).until(
            EC.presence_of_element_located((By.XPATH, elements_xpath)))
        data = selector.xpath(elements_xpath)
    result = dict()
    for team_link in data:
        row = Selector(text=team_link.extract())
        name = row.xpath(
            self.xpath.get('element_xpath_name')).extract_first()
        coeff = row.xpath(
            self.xpath.get('element_xpath_rate')).extract_first()
        # Skip rows missing either field.
        if not coeff or not name:
            continue
        name = name.strip()
        # Strip tabs/newlines from the coefficient before storing.
        result[name] = coeff.strip('\n').replace('\t', '').replace('\n', '')
    self.driver.quit()
    yield result
def name_en(cls, input_value):
    """Extract the English part of the page's <h1> title.

    NOTE(review): relies on Python 2 semantics — `str()` on unicode raises
    UnicodeEncodeError for non-ASCII text, used here to decide whether the
    candidate is genuinely English.  Under Python 3 the UnicodeEncodeError
    branches never fire — confirm target runtime.

    Fix: the original used the Python-2-only statement ``print e``;
    ``print(e)`` behaves identically on both interpreters.  The dead
    inner ``else: return`` (unreachable after ``return name``) was removed.
    """
    try:
        if isinstance(input_value, list):
            input_value = ''.join(input_value)
        name = Selector(text=input_value).xpath(
            "//h1//text()")
        if len(name.re("-.*-(.*)")) > 0:
            # Title shaped like "a - b - c": take the part after the last dash.
            try:
                name_en = str(''.join(name.re("-.*-(.*)")))
                return name_en
            except UnicodeEncodeError:
                # Non-ASCII: not an English name.
                return
        elif len(name.re("-(.*)")) > 0:
            # Title shaped like "a - b": take the part after the dash.
            name_en = name.re("-(.*)")
            return name_en
        else:
            # No dash: use the whole <h1> text if it is ASCII-only.
            name = name.extract()
            if isinstance(name, list):
                name = ''.join(name)
            try:
                name = str(name)
                return name
            except UnicodeEncodeError:
                return
    except Exception as e:
        # Best-effort: report any scraping failure and fall through to None.
        print(e)
def parse(self, response):
    """Save the raw ``var option`` script block for the car id carried in response.meta."""
    car_id = response.meta['car_id']
    script_nodes = Selector(response).xpath(
        '//script[contains(., "var option")]/text()')
    # Nothing to persist when the page lacks the option script.
    if not script_nodes:
        return
    utils.save_autohome(car_id, script_nodes.extract()[0])
def parseDetail(self, response):
    """Fill publishing house / publishing time / price on the item from the book page."""
    item = response.meta['item']
    # Extract the page HTML once; all three fields are scraped from it.
    page = Selector(response).extract()

    def _clean(match):
        # Drop spaces and newlines from the captured field value.
        return match.group(1).replace(' ', '').replace('\n', '')

    # Compute all three before assigning, so a failed match (AttributeError
    # on .group) leaves the item untouched, exactly like the original.
    publishing_house = _clean(
        re.search('<span class="pl">出版社:</span>(.*?)<br', page, re.S))
    publishing_time = _clean(
        re.search('<span class="pl">出版年:</span>(.*?)<br', page, re.S))
    price = _clean(
        re.search('<span class="pl">定价:</span>(.*?)<br', page, re.S))
    item["publishingHouse"] = publishing_house
    item["publishingTime"] = publishing_time
    item["price"] = price
    yield item
def _get_track_data(self, track_block):
    """Map one track row element to its artist and track-name strings."""
    row = Selector(text=track_block.extract())
    locators = self._locator_dictionary
    return {
        "artist": row.xpath(locators["artist_name"]).get(),
        "track": row.xpath(locators["track_name"]).get(),
    }
def extractMixture(self, block: Selector):
    """Flatten *block* into plain text: remove newlines and <dd> markers,
    unwrap hyperlinks to their text, strip remaining tags, then trim."""
    raw = block.extract().replace('\n', '')
    raw = DD_REGEX.sub("", raw)
    # Keep only the link text (first capture group).
    raw = HREF_REGEX.sub("\\1", raw)
    raw = TAG_REGEX.sub("", raw)
    return raw.strip()
def simple_judge(html):
    """
    judge if it is a tv series
    :param html: html source
    :return: 1 if the info block mentions an episode count, else 0
    """
    info = Selector(text=html).xpath('//div[@id="info"]').extract()[0]
    return 1 if u'集数' in info else 0
def extract_next_entry_url(domain, entry_source):
    """Return the absolute URL of the 'Next' entry link, or None when absent."""
    navbar_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[1]//td[1]"
    anchors = Selector(text=entry_source).xpath(navbar_xpath + "//*[contains(text(), 'Next')]")
    if not anchors:
        return None
    tag = anchors.extract()[0]
    # Slice out the "id=..." query fragment embedded in the anchor tag.
    tag = tag[tag.find("id"):]
    tag = tag[:tag.find(">")]
    tag = tag.strip("\"")
    return domain + ("entry.cfm?" + tag)
def extract_entry_trip_mileage(entry_source):
    """Parse the 'Trip Miles' figure from an entry page as a float; None when missing."""
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[3]"
    spans = Selector(text=entry_source).xpath(trip_info_xpath + "//td//span[contains(text(), 'Trip Miles')]/following::span")
    if not spans:
        return None
    html = spans.extract()[0]
    # The value sits between the first '>' and the next '<'.
    value = html[html.find(">") + 1:]
    value = value[:value.find("<")]
    return float(value) if value != '' else None
def jd_search(keys):
    """Look up each key on JD and append "key<TAB>count" lines to the output file.

    Configuration (output path and result-count XPath) comes from
    ``./__configure__/configure.xml``.  A key whose page cannot be fetched
    or parsed is recorded with a count of '0' (deliberate best-effort).

    Fixes: the bare ``except:`` is narrowed to ``except Exception`` so
    KeyboardInterrupt/SystemExit are not swallowed, and the Python-2-only
    print statement becomes a print() call.
    """
    conf = ConfigureParser('./__configure__/configure.xml')
    output_file = conf.get_configure_by_tag_name('output_file')
    xpath = conf.get_configure_by_tag_name('xpath')
    for key in keys:
        source = get_html(handle_type(key))
        try:
            sel = Selector(text=source).xpath(xpath)
            num = str(num_trans(sel.extract()[0]))
        except Exception:
            # Missing or unparsable result counts as zero.
            num = '0'
        print(handle_type(key), num)
        # NOTE(review): appending str to a binary-mode ('ab') file only
        # works under Python 2 — confirm target runtime before porting.
        with open(output_file, 'ab') as f:
            f.write(key + '\t' + num + '\n')
def extract_entry_start_loc(entry_source):
    """Parse the 'Starting Location' text from an entry page; None when missing."""
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[3]"
    spans = Selector(text=entry_source).xpath(trip_info_xpath + "//td//span[contains(text(), 'Starting Location')]/following::span[1]")
    if not spans:
        return None
    html = spans.extract()[0]
    # The value sits between the first '>' and the next '<'.
    text = html[html.find(">") + 1:]
    text = text[:text.find("<")]
    return text if text != '' else None
def extract_entry_day_mileage(entry_source):
    """Parse the 'Today' mileage figure from an entry page as a float; None when missing."""
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table[1]"
    spans = Selector(text=entry_source).xpath(trip_info_xpath + "//td//span[contains(text(), 'Today')]/following::span")
    if not spans:
        return None
    html = spans.extract()[0]
    # The value sits between the first '>' and the next '<'.
    value = html[html.find(">") + 1:]
    value = value[:value.find("<")]
    return float(value) if value != '' else None
def extract_entry_destination(entry_source):
    """Parse the 'Destination' text from an entry page; None when missing."""
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[3]"
    spans = Selector(text=entry_source).xpath(trip_info_xpath + "//td//span[contains(text(), 'Destination')]/following::span[1]")
    if not spans:
        return None
    html = spans.extract()[0]
    # The value sits between the first '>' and the next '<'.
    text = html[html.find(">") + 1:]
    text = text[:text.find("<")]
    return text if text != '' else None
def extract_first_journal_url(journal_url):
    """Return the URL of the journal's first entry, or *journal_url* when already on it."""
    domain = "http://www.trailjournals.com/"
    with contextlib.closing(urlopen(journal_url)) as fp:
        source = fp.read()
    first_entry_url_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table//tr[1]"
    anchors = Selector(text=source).xpath(first_entry_url_xpath + "//a[contains(text(), 'First')]")
    if not anchors:
        # Already on the first journal page.
        return journal_url
    # Not on the first journal page: pull the href target out of the anchor.
    tag = anchors.extract()[0]
    start = tag.find("href=") + len("href=\"")
    href = tag[start:]
    href = href[:href.find("\"")]
    return domain + href
def parse_porn_info(self, response):
    """Build a PornItem from the flashvars JSON blob embedded in the page."""
    page = Selector(response).extract()
    flashvars = re.findall('flashvars_.*?=(.*?);\n', page)
    meta = json.loads(flashvars[0])
    pornItem = PornItem()
    pornItem['video_duration'] = meta.get('video_duration')
    pornItem['title'] = meta.get('video_title')
    pornItem['image_url'] = meta.get('image_url')
    pornItem['url'] = meta.get('link_url')
    pornItem['video_url'] = meta.get('quality_480p')
    yield pornItem
def extract_entry(entry_source):
    """Return the cleaned journal entry text, or None when absent/empty."""
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table[1]"
    blocks = Selector(text=entry_source).xpath(trip_info_xpath + "//td//blockquote")
    if not blocks:
        return None
    raw = blocks.extract()[0]
    # Keep everything between the opening tag and the trailing HTML comment.
    body = raw[raw.find("<blockquote>") + len("<blockquote>"):]
    body = body[:body.find("<!---")]
    # Normalize whitespace and line breaks into single spaces.
    for old, new in (("\r", ""), ("\n", ""), ("\t", ""), ("<br>", " "), ("\xa0", "")):
        body = body.replace(old, new)
    body = body.strip(" ")
    return body if body != '' else None
def parse(self, text):
    """Apply the configured css/xpath query to *text*, then collect regex matches.

    Returns the list of matches when ``self.collect`` is set, otherwise a
    bool telling whether anything matched at all.
    """
    if not hasattr(text, 'xpath'):
        # Plain string input: wrap it so css/xpath queries are possible.
        text = Selector(text=text)
    if self.css is not None:
        candidates = text.css(self.css).extract()
    elif self.xpath is not None:
        candidates = text.xpath(self.xpath).extract()
    elif type(text) is Selector:
        # No query configured: scan the selector's own markup.
        candidates = [text.extract()]
    else:
        candidates = [text.text]
    matches = [m.group(0)
               for candidate in candidates
               for m in self.regex.finditer(candidate)]
    return matches if self.collect else bool(matches)
# --- Display configuration for pandas / numpy output -----------------------
desired_width = 320
pd.set_option('display.width', desired_width)
np.set_printoptions(linewidth=desired_width)
pd.set_option('display.max_colwidth', 400)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 300)
pd.options.display.float_format = '{:.3f}'.format

# --- Fetch the raw text of El Quijote --------------------------------------
url = 'https://gist.githubusercontent.com/jsdario/6d6c69398cb0c731' \
    '11e49f1218960f79/raw/8d4fc4548d437e2a7203a5aeeace5477f598827d/el_quijote.txt'
# Fix: Selector(text=...) requires a str; the original passed the raw bytes
# from `.content`, which raises TypeError in parsel — use the decoded `.text`.
html = requests.get(url=url).text
selection = Selector(text=html)
raw_data = selection.extract()
print(raw_data)
'''
1. Lemmatizer and stemming => root of words
2. Lowering and .isalpha() => homogeneization
3. Creating a corpus and dictionary => [(id, freq), (34, 514), ()]
4. Word2vec
# 5. Tf-idf model
**+ 6. n-grams (bigrams, trigrams, ...)
'''
''' (1) Text Analytics '''
print('Length of El Quijote: ', len(raw_data))
words = word_tokenize(raw_data, language='spanish')
print('Words of El Quijote: ', len(words))
print('Type of object: ', type(words))
def css_pojie(self, result):
    """Fetch *result* and dump the page markup to the local svg.txt file.

    Fixes: (1) the original called ``parse.urljoin(result)`` with a single
    argument, which raises TypeError for ``urllib.parse.urljoin`` (it needs
    both base and url) — *result* is already a full URL, so it is used
    directly; (2) ``Selector.extract()`` returns ``str`` but the file was
    opened in binary mode, so the write crashed with TypeError — the text is
    now encoded explicitly; (3) the handle is closed via a context manager.
    """
    res = requests.get(result, headers=self.headers)
    selector = Selector(text=res.text)
    with open("D:/py3code/ArticleSpider/ArticleSpider/svg.txt", "wb") as fh:
        fh.write(selector.extract().encode("utf-8"))
def extract_task(self, content: scrapy.Selector, response: HtmlResponse):
    # TODO: update the documentation
    """
    Extracts task information from the page content.

    Splits the page markup into a token stream (tags and text runs), walks
    it section by section (Условие / Подсказка / Решение / Ответ), and
    accumulates each section's text and image URLs.

    :return: ``(task_dict, image_urls, tex_used)`` where ``task_dict`` maps
        section names (via ``DEFAULT_NAMES``) to ``(text, section_image_urls)``
        pairs, ``image_urls`` is the flat list of all image URLs found, and
        ``tex_used`` flags whether any TeX-rendered image was encountered.
    """
    # Tokenize: re.split with a capturing group keeps the tag delimiters
    # ("<h3", "</p", ">", ...) interleaved with the text runs.
    text = list(map(lambda s: s.strip(), re.split(r'(</?\w{,10}|>)', ''.join(content.extract()))))
    task_dict = {}
    current_section = ''       # name of the section currently being filled
    session_text = []          # text fragments of the current section
    image_urls = []            # every image URL seen, across all sections
    images_urls_of_section = []  # image URLs of the current section only
    text_iterator = enumerate(text)
    tex_used = False
    while True:
        try:
            i, line = next(text_iterator)
            new_section = None
        except StopIteration:
            break
        if line == '<h3':
            # An <h3> opens a section header: skip the ">" and attribute
            # tokens, then read the header text and match a section name.
            next(text_iterator)
            next(text_iterator)
            i, line = next(text_iterator)
            new_section = re.findall(r'(Условие|Подсказка|Решение|Ответ|Источники и прецеденты использования)', line)
        # Ignore everything before the first recognized section header.
        if (not new_section) and (not current_section):
            continue
        if new_section:
            # Flush the finished section before starting the new one.
            if current_section:
                session_text = ' '.join(filter(lambda s: s, session_text))
                session_text = ' '.join(session_text.split())
                # NOTE(review): result of replace() is discarded — quotes are
                # not actually stripped; confirm whether this was intended.
                session_text.replace('\'', '')
                task_dict[DEFAULT_NAMES[current_section]] = session_text, images_urls_of_section
            current_section = new_section[0]
            # The sources/usage section marks the end of the task body.
            if current_section == 'Источники и прецеденты использования':
                break
            session_text = []
            images_urls_of_section = []
            continue
        if '<img' in line:
            # Next token holds the tag attributes (src, alt, ...).
            i, line = next(text_iterator)
            src = re.search(r'src=\".+\d+\"', line).group()
            if src:
                # Images whose alt text is a $...$ formula are TeX renders,
                # not content images — flag them instead of collecting.
                tex = re.search(r'alt=\"\$(.|\n)+\$\"', line)
                if tex is None:
                    image_src = src[5:-1]
                    image_url = response.urljoin(image_src)
                    image_urls.append(image_url)
                    images_urls_of_section.append(image_url)
                else:
                    tex_used = True
            continue
        if re.match(r'<\w{1,10}', line):
            # Opening tag: skip its attribute tokens up to the closing ">".
            while line != '>':
                i, line = next(text_iterator)
            continue
        # Skip any remaining tag/delimiter tokens.
        if re.match(r'(</?\w{,10}|>)', line):
            continue
        if line:
            # NOTE(review): old_line is assigned but never read afterwards.
            old_line = line
            # Drop boilerplate ("documents also available as..."), links,
            # and dangling tag remnants (but keep "-->" comment closers
            # handled below).
            if 'Также доступны документы в формате' in line or \
                    ('href' in line or line == 'TeX') or \
                    (line.endswith('>') and not line.endswith('-->')):
                continue
            line = line.strip()
            # NOTE(review): re.S is passed as the *count* argument of
            # re.sub, not as flags (count=16) — confirm intent.
            line = re.sub(r'(^>|!-- MATH|--)', '', line, re.S).strip()
            line = line.replace('\\begin{displaymath}', '$').replace('\\end{displaymath}', '$')
            if line:
                session_text.append(line)
    return task_dict, image_urls, tex_used