Example 1
 def _parse_handler(self, response):
     self.logger.info("parse url---:{0}".format(response.url))
     self.driver.get(response.url)
     # Selector allows you to select parts of an XML or HTML text using CSS
     # or XPath expressions and extract data from it.
     selector = Selector(text=self.driver.page_source)
     print(selector.extract())
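The comment in this example summarizes what Selector is for; as a minimal standalone sketch of that CSS/XPath extraction (the HTML string here is invented for illustration):

from scrapy.selector import Selector

html = '<html><body><h1 class="title">Hello</h1><a href="/next">Next</a></body></html>'
sel = Selector(text=html)
print(sel.css('h1.title::text').get())  # 'Hello'
print(sel.xpath('//a/@href').get())     # '/next'
print(sel.extract())                    # the whole document, re-serialized

Note that extract() on a bare selector, as in the example above, returns the full re-serialized page rather than any particular field.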
Example 2
 def name_vn(cls, input_value):
     # Python 2 helper: str() on a unicode value raises UnicodeEncodeError
     # exactly when the value contains non-ASCII characters, so this method
     # returns a name only when it is not plain ASCII, i.e. when it actually
     # contains Vietnamese characters.
     if isinstance(input_value, list):
         input_value = ''.join(input_value)
         name_1 = Selector(text=input_value).xpath("//span[@itemprop='title']//text()").extract()
         try:
             name_1 = str(''.join(name_1))
             name = Selector(text=input_value).xpath("//h1//text()")
             if len(name.re("(.*-.*-.*)")) > 0:
                 try:
                     str(''.join(name.re("(.*-.*-.*)")))
                     return  # ASCII-only: no Vietnamese name to return
                 except UnicodeEncodeError:
                     return ''.join(name.re("(.*-.*-.*)"))
             elif len(name.re("(.*)-")) > 0:
                 # Note: .re() returns a list of strings, not a single string.
                 name_vn = name.re("(.*)-")
                 return name_vn
             else:
                 name = name.extract()
                 if isinstance(name, list):
                     name = ''.join(name)
                     try:
                         str(name)
                         return  # ASCII-only: no Vietnamese name to return
                     except UnicodeEncodeError:
                         return name
         except UnicodeEncodeError:
             return ''.join(name_1)
     else:
         return
Example 3
    def parse(self, response):
        self.driver = webdriver.Firefox(
            executable_path=settings.get('WEB_DRIVER_PATH'))
        self.driver.get(response.request.url)

        if hasattr(self, 'page_loaded_flag'):
            WebDriverWait(self.driver, settings.get('DRIVER_TIMEOUT')).until(
                EC.visibility_of_element_located(
                    (By.CLASS_NAME, self.page_loaded_flag)))

        if 'select_form' in self.xpath:
            select_form = self.xpath.get('select_form')

            WebDriverWait(self.driver, settings.get('DRIVER_TIMEOUT')).until(
                EC.visibility_of_element_located((By.XPATH, select_form)))
            self.driver.find_element(By.XPATH, select_form).click()
            if hasattr(self, 'post_load_content_flag'):
                WebDriverWait(self.driver,
                              settings.get('DRIVER_TIMEOUT')).until(
                                  EC.presence_of_element_located(
                                      (By.CLASS_NAME,
                                       self.post_load_content_flag)))
        selector = Selector(text=self.driver.page_source)

        if 'block_xpath' in self.xpath:
            block_xpath = self.xpath.get('block_xpath')
            elements_xpath = self.xpath.get('elements_xpath')

            WebDriverWait(self.driver, settings.get('DRIVER_TIMEOUT')).until(
                EC.visibility_of_element_located((By.XPATH, block_xpath)))

            team_link = Selector(
                text=selector.xpath(block_xpath).extract_first())

            WebDriverWait(self.driver, settings.get('DRIVER_TIMEOUT')).until(
                EC.visibility_of_element_located((By.XPATH, elements_xpath)))
            data = team_link.xpath(elements_xpath)
        else:
            elements_xpath = self.xpath.get('elements_xpath')

            WebDriverWait(self.driver, settings.get('DRIVER_TIMEOUT')).until(
                EC.presence_of_element_located((By.XPATH, elements_xpath)))

            data = selector.xpath(elements_xpath)

        result = dict()
        for team_link in data:
            row = Selector(text=team_link.extract())
            name = row.xpath(
                self.xpath.get('element_xpath_name')).extract_first()
            coeff = row.xpath(
                self.xpath.get('element_xpath_rate')).extract_first()
            if not coeff or not name:
                continue
            name = name.strip()

            result[name] = coeff.replace('\t', '').replace('\n', '')
        self.driver.quit()
        yield result
Example 4
 def name_en(cls, input_value):
     # Python 2 helper, mirror of name_vn above: returns the name only when
     # str() succeeds, i.e. when it is ASCII-encodable English text.
     try:
         if isinstance(input_value, list):
             input_value = ''.join(input_value)
             name = Selector(text=input_value).xpath(
                 "//h1//text()")
             if len(name.re("-.*-(.*)")) > 0:
                 try:
                     name_en = str(''.join(name.re("-.*-(.*)")))
                     return name_en
                 except UnicodeEncodeError:
                     return
             elif len(name.re("-(.*)")) > 0:
                 name_en = name.re("-(.*)")
                 return name_en
             else:
                 name = name.extract()
                 if isinstance(name, list):
                     name = ''.join(name)
                     try:
                         name = str(name)
                         return name
                     except UnicodeEncodeError:
                         return
         else:
             return
     except Exception as e:
         print(e)
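Examples 2 and 4 hinge on a Python 2 idiom: str() on a unicode value raises UnicodeEncodeError exactly when the value contains non-ASCII characters. A two-line illustration (Python 2):

str(u'Hanoi')    # 'Hanoi': pure ASCII, so name_en returns it
str(u'Hà Nội')   # raises UnicodeEncodeError, so name_vn returns the value

So name_en yields only ASCII-encodable names, and name_vn only names with Vietnamese diacritics.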
Example 5
 def parse(self, response):
     car_id = response.meta['car_id']
     # pattern = re.compile(r"var option = ({.*}?);", re.MULTILINE | re.DOTALL)
     autohome_car = Selector(response).xpath(
         '//script[contains(., "var option")]/text()')
     if autohome_car:
         autohome_car_content = autohome_car.extract()[0]
         utils.save_autohome(car_id, autohome_car_content)
Example 6
    def parseDetail(self, response):
        item = response.meta['item']
        selector = Selector(response)
        # Serialize the page once instead of re-extracting it for each field.
        page = selector.extract()
        publishingHouse = re.search('<span class="pl">出版社:</span>(.*?)<br',
                                    page, re.S).group(1).replace(' ', '').replace('\n', '')
        publishingTime = re.search('<span class="pl">出版年:</span>(.*?)<br',
                                   page, re.S).group(1).replace(' ', '').replace('\n', '')
        price = re.search('<span class="pl">定价:</span>(.*?)<br',
                          page, re.S).group(1).replace(' ', '').replace('\n', '')

        item["publishingHouse"] = publishingHouse
        item["publishingTime"] = publishingTime
        item["price"] = price
        yield item
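For reference, the same fields can usually be read with XPath instead of regexes over the serialized page. A hedged sketch (the field() helper is hypothetical, and the XPath assumes the <span class="pl">label:</span>value<br> layout implied by the regexes above):

def field(selector, label):
    # First text node after the <span class="pl"> label.
    value = selector.xpath(u'//span[@class="pl"][contains(., "%s")]'
                           u'/following::text()[1]' % label).extract_first() or ''
    return value.replace(' ', '').replace('\n', '')

publishingHouse = field(selector, u'出版社')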
Example 7
 def _get_track_data(self, track_block):
     track_block = Selector(text=track_block.extract())
     return {
         "artist": track_block.xpath(self._locator_dictionary["artist_name"]).get(),
         "track": track_block.xpath(self._locator_dictionary["track_name"]).get(),
     }
Example 8
 def extractMixture(self, block: Selector):
     text = block.extract()
     text = text.replace('\n', '')
     text = DD_REGEX.sub("", text)
     text = HREF_REGEX.sub("\\1", text)
     text = TAG_REGEX.sub("", text)
     text = text.strip()
     # print(text)
     return text
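The three regex constants are defined outside this snippet; plausible definitions consistent with how extractMixture uses them (these are assumptions, not the original values):

import re

DD_REGEX = re.compile(r'</?dd[^>]*>')          # assumed: drop <dd>/</dd> wrappers
HREF_REGEX = re.compile(r'<a[^>]*>(.*?)</a>')  # assumed: keep only the link text (group 1)
TAG_REGEX = re.compile(r'<[^>]+>')             # assumed: strip any remaining tags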
Example 9
def simple_judge(html):
    """
    judge if it is a tv series
    :param html: html source
    :return: bool result
    """
    info = Selector(text=html).xpath('//div[@id="info"]').extract_first() or ''
    # An episode count ("集数") appears in the info block only for TV series.
    return u'集数' in info
Example 10
def extract_next_entry_url(domain, entry_source):
    navbar_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[1]//td[1]"
    next_entry_url = Selector(text=entry_source).xpath(navbar_xpath + "//*[contains(text(), 'Next')]")
    if next_entry_url:
        next_entry_url = next_entry_url.extract()[0]
        next_entry_url = next_entry_url[next_entry_url.find("id"):]
        next_entry_url = next_entry_url[:next_entry_url.find(">")]
        next_entry_url = next_entry_url.strip("\"")
        next_entry_url = "entry.cfm?" + next_entry_url
        next_entry_url = domain + next_entry_url
        return next_entry_url
    else:
        return None
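A sketch of the same lookup without the string surgery, reading the link's href attribute directly (this assumes the 'Next' element is an <a> whose href already holds the entry.cfm?id=... value the code above reconstructs):

from scrapy.selector import Selector

def extract_next_entry_url_via_href(domain, entry_source):
    navbar_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[1]//td[1]"
    href = Selector(text=entry_source).xpath(
        navbar_xpath + "//a[contains(text(), 'Next')]/@href").extract_first()
    return domain + href if href else None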
Example 11
def extract_entry_trip_mileage(entry_source):
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[3]"
    entry_trip_mileage = Selector(text=entry_source).xpath(trip_info_xpath + "//td//span[contains(text(), 'Trip Miles')]/following::span")
    if entry_trip_mileage:
        entry_trip_mileage = entry_trip_mileage.extract()[0]
        mileage_start = entry_trip_mileage.find(">") + len(">")
        entry_trip_mileage = entry_trip_mileage[mileage_start:]
        entry_trip_mileage = entry_trip_mileage[:entry_trip_mileage.find("<")]
        if entry_trip_mileage != '':
            return float(entry_trip_mileage)
        else:
            return None
    else:
        return None
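The manual "<"/">" slicing in this and the following extract_entry_* helpers can likewise be avoided by selecting the text node itself; a hedged sketch of that variant:

from scrapy.selector import Selector

def extract_entry_trip_mileage_via_text(entry_source):
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[3]"
    # /text() yields the span's contents directly, with no markup to strip.
    mileage = Selector(text=entry_source).xpath(
        trip_info_xpath + "//td//span[contains(text(), 'Trip Miles')]"
                          "/following::span[1]/text()").extract_first()
    return float(mileage) if mileage and mileage.strip() else None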
Example 12
def jd_search(keys):
    conf = ConfigureParser('./__configure__/configure.xml')
    output_file = conf.get_configure_by_tag_name('output_file')
    xpath = conf.get_configure_by_tag_name('xpath')
    for key in keys:
        source = get_html(handle_type(key))
        try:
            sel = Selector(text=source).xpath(xpath)
            num = str(num_trans(sel.extract()[0]))
        except Exception:  # narrowed from a bare except
            num = '0'
        print(handle_type(key), num)
        with open(output_file, 'ab') as f:
            # The file is opened in binary mode, so encode before writing.
            f.write((key + '\t' + num + '\n').encode('utf-8'))
Example 13
def extract_entry_start_loc(entry_source):
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[3]"
    start_loc = Selector(text=entry_source).xpath(trip_info_xpath + "//td//span[contains(text(), 'Starting Location')]/following::span[1]")
    if start_loc:
        start_loc = start_loc.extract()[0]
        start_loc_start = start_loc.find(">") + len(">")
        start_loc = start_loc[start_loc_start:]
        start_loc = start_loc[:start_loc.find("<")]
        if start_loc != '':
            return start_loc
        else:
            return None
    else:
        return None
Example 14
def extract_entry_day_mileage(entry_source):
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table[1]"
    day_mileage = Selector(text=entry_source).xpath(trip_info_xpath + "//td//span[contains(text(), 'Today')]/following::span")
    if day_mileage:
        day_mileage = day_mileage.extract()[0]
        day_mileage_start = day_mileage.find(">") + len(">")
        day_mileage = day_mileage[day_mileage_start:]
        day_mileage = day_mileage[:day_mileage.find("<")]
        if day_mileage != '':
            return float(day_mileage)
        else:
            return None
    else:
        return None
Example 15
def extract_entry_destination(entry_source):
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[3]"
    destination = Selector(text=entry_source).xpath(trip_info_xpath + "//td//span[contains(text(), 'Destination')]/following::span[1]")
    if destination:
        destination = destination.extract()[0]
        destination_start = destination.find(">") + len(">")
        destination = destination[destination_start:]
        destination = destination[:destination.find("<")]
        if destination != '':
            return destination
        else:
            return None
    else:
        return None
Example 16
def extract_first_journal_url(journal_url):
    domain = "http://www.trailjournals.com/"
    with contextlib.closing(urlopen(journal_url)) as fp:
        source = fp.read()
    first_entry_url_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table//tr[1]"
    first_entry_url = Selector(text=source).xpath(first_entry_url_xpath + "//a[contains(text(), 'First')]")
    if first_entry_url:
        first_entry_url = first_entry_url.extract()[0]
        # Not on the first journal page. Record the first entry url.
        url_start = first_entry_url.find("href=") + len("href=\"")
        first_entry_url = first_entry_url[url_start:]
        first_entry_url = first_entry_url[:first_entry_url.find("\"")]
        return domain + first_entry_url
    # Already on the first journal page.
    return journal_url
Example 17
 def parse_porn_info(self, response):
     pornItem = PornItem()
     selector = Selector(response)
     _ph_info = re.findall('flashvars_.*?=(.*?);\n', selector.extract())
     _ph_info_json = json.loads(_ph_info[0])
     duration = _ph_info_json.get('video_duration')
     pornItem['video_duration'] = duration
     title = _ph_info_json.get('video_title')
     pornItem['title'] = title
     image_url = _ph_info_json.get('image_url')
     pornItem['image_url'] = image_url
     url = _ph_info_json.get('link_url')
     pornItem['url'] = url
     video_url = _ph_info_json.get('quality_480p')
     pornItem['video_url'] = video_url
     yield pornItem
Example 18
def extract_entry(entry_source):
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]/table[1]"
    entry = Selector(text=entry_source).xpath(trip_info_xpath + "//td//blockquote")
    if entry:
        entry = entry.extract()[0]
        entry_start = entry.find("<blockquote>") + len("<blockquote>")
        entry = entry[entry_start:]
        entry = entry[:entry.find("<!---")]
        entry = entry.replace("\r", "").replace("\n", "").replace("\t", "")
        entry = entry.replace("<br>", " ").replace("\xa0", "")
        entry = entry.strip(" ")
        if entry != '':
            return entry
        else:
            return None
Example 19
    def parse(self, text):
        if not hasattr(text, 'xpath'):
            # this means it is not already a selector
            text = Selector(text=text)

        if self.css is not None:
            extracted = text.css(self.css).extract()
        elif self.xpath is not None:
            extracted = text.xpath(self.xpath).extract()
        else:
            if isinstance(text, Selector):
                extracted = [text.extract()]
            else:
                extracted = [text.text]

        result = []
        for extracted_str in extracted:
            for match in self.regex.finditer(extracted_str):
                result.append(match.group(0))

        if self.collect:
            return result
        return bool(result)
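A minimal, hypothetical harness for the parse() method above, since its host class is not shown; the Rule name, constructor, and attribute defaults are assumptions inferred from the attributes the method reads:

import re

from scrapy.selector import Selector

class Rule:
    def __init__(self, regex, css=None, xpath=None, collect=False):
        self.regex = re.compile(regex)
        self.css = css
        self.xpath = xpath
        self.collect = collect

    def parse(self, text):  # condensed version of the method shown above
        if not hasattr(text, 'xpath'):
            text = Selector(text=text)
        if self.css is not None:
            extracted = text.css(self.css).extract()
        elif self.xpath is not None:
            extracted = text.xpath(self.xpath).extract()
        else:
            extracted = [text.extract()]
        result = [m.group(0) for s in extracted
                  for m in self.regex.finditer(s)]
        return result if self.collect else bool(result)

rule = Rule(regex=r'\d+', css='p::text', collect=True)
print(rule.parse('<html><body><p>3 items, 42 pages</p></body></html>'))  # ['3', '42']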
Example 20
import numpy as np
import pandas as pd
import requests
from nltk.tokenize import word_tokenize
from scrapy.selector import Selector

desired_width = 320
pd.set_option('display.width', desired_width)
np.set_printoptions(linewidth=desired_width)
pd.set_option('display.max_colwidth', 400)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 300)
pd.options.display.float_format = '{:.3f}'.format


url = 'https://gist.githubusercontent.com/jsdario/6d6c69398cb0c731' \
      '11e49f1218960f79/raw/8d4fc4548d437e2a7203a5aeeace5477f598827d/el_quijote.txt'

html = requests.get(url=url).text  # .text (str), not .content (bytes): Selector(text=...) needs str
selection = Selector(text=html)
raw_data = selection.extract()
print(raw_data)
'''
1. Lemmatizer and stemming => root of words
2. Lowering and .isalpha() => homogenization
3. Creating a corpus and dictionary => [(id, freq), (34, 514), ()]
4. Word2vec
5. Tf-idf model
6. n-grams (bigrams, trigrams, ...)
'''
''' (1) Text Analytics '''
print('Length of El Quijote: ', len(raw_data))

words = word_tokenize(raw_data, language='spanish')
print('Words of El Quijote: ', len(words))
print('Type of object: ', type(words))
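As a quick sketch of steps 1 and 2 from the plan above (word roots via NLTK's Spanish SnowballStemmer, plus lowercasing and .isalpha() filtering; the variable names are illustrative):

from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('spanish')
clean_words = [w.lower() for w in words if w.isalpha()]  # homogenization
stems = [stemmer.stem(w) for w in clean_words]           # word roots
print('Distinct stems: ', len(set(stems)))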
Example 21
 def css_pojie(self, result):
     # parse.urljoin() requires a base URL as well as a target; assuming
     # `result` is already an absolute URL, request it directly.
     res = requests.get(result, headers=self.headers)
     selector = Selector(text=res.text)
     # The file is opened in binary mode, so encode the extracted markup.
     with open("D:/py3code/ArticleSpider/ArticleSpider/svg.txt", "wb") as fh:
         fh.write(selector.extract().encode('utf-8'))
Example 22
    def extract_task(self, content: scrapy.Selector, response: HtmlResponse):
        # TODO: update the documentation
        """
        Extracts the task information from the page content
        :return: a dict keyed by the task's section names, whose values are
                 (text, images) pairs, where images is a list of
                 (image_url, tex_view) tuples
        """
        # Pull out the text
        # text = list(map(lambda s: s.strip(), ''.join(content.extract()).split('\n')))
        # text = list(map(lambda s: s.strip(), re.split(r'(<br>|<p>|</p>|>\n)', ''.join(content.extract()))))
        text = list(map(lambda s: s.strip(), re.split(r'(</?\w{,10}|>)', ''.join(content.extract()))))

        task_dict = {}
        current_section = ''
        session_text = []
        image_urls = []
        images_urls_of_section = []
        text_iterator = enumerate(text)
        tex_used = False
        while True:
            try:
                i, line = next(text_iterator)
                new_section = None
            except StopIteration:
                break

            if line == '<h3':
                next(text_iterator)
                next(text_iterator)
                i, line = next(text_iterator)
                new_section = re.findall(r'(Условие|Подсказка|Решение|Ответ|Источники и прецеденты использования)', line)

            if (not new_section) and (not current_section):
                continue
            if new_section:
                if current_section:
                    session_text = ' '.join(filter(lambda s: s, session_text))
                    session_text = ' '.join(session_text.split())
                    # str.replace returns a new string; the result was being discarded.
                    session_text = session_text.replace('\'', '')
                    task_dict[DEFAULT_NAMES[current_section]] = session_text, images_urls_of_section
                current_section = new_section[0]
                if current_section == 'Источники и прецеденты использования':
                    break
                session_text = []
                images_urls_of_section = []
                continue
            if '<img' in line:
                i, line = next(text_iterator)
                # Keep the match object: .group() on a failed search would raise.
                src = re.search(r'src=\".+\d+\"', line)
                if src:
                    tex = re.search(r'alt=\"\$(.|\n)+\$\"', line)
                    if tex is None:
                        image_src = src.group()[5:-1]
                        image_url = response.urljoin(image_src)
                        image_urls.append(image_url)
                        images_urls_of_section.append(image_url)
                    else:
                        tex_used = True
                continue
            if re.match(r'<\w{1,10}', line):
                while line != '>':
                    i, line = next(text_iterator)
                continue
            if re.match(r'(</?\w{,10}|>)', line):
                continue
            if line:
                old_line = line
                if 'Также доступны документы в формате' in line or \
                        ('href' in line or line == 'TeX') or \
                        (line.endswith('>') and not line.endswith('-->')):
                    continue
                line = line.strip()
                # The 4th positional argument of re.sub is count, not flags.
                line = re.sub(r'(^>|!-- MATH|--)', '', line, flags=re.S).strip()
                line = line.replace('\\begin{displaymath}', '$').replace('\\end{displaymath}', '$')
                if line:
                    session_text.append(line)

        return task_dict, image_urls, tex_used