def download(self, lastNchapter=-1):
    """Download new chapters of the current book and append them to a text file.

    lastNchapter -- download only the last N chapters; -1 (default) means
    resume from the chapter count recorded by the previous run (self.checkLog()).
    Returns (file name, full path) of the written .txt file.
    Exits the process on bad URL / unexpected list length / nothing new.
    """
    if self.url is None:
        print("Error, no url loaded")
        exit(1)
    self.bookTitle = wp.getBookTitle(self.url)
    if 'html' in self.url:
        print('Menu URL error')
        exit(0)
    self.updatePath()
    chList = wp.getChapterList(self.url)
    currentLength = len(chList)
    if lastNchapter == -1:
        # check how many chapters the previous run already downloaded
        lastTimeLength = self.checkLog()
    else:
        lastTimeLength = int(currentLength) - int(lastNchapter) - 1
    if lastTimeLength > currentLength:
        print('Unexpected list length, check lastTimeLength')
        exit(1)
    elif lastTimeLength == currentLength:
        # message text is user-facing; '未更新' = "not updated"
        print(self.bookTitle + ' --> 未更新' + str(currentLength))
        exit(0)
    fileName = str(lastTimeLength + 1) + '-' + str(currentLength) + ' ' + self.bookTitle
    # BUG FIX: guard the progress denominator — when exactly one new chapter
    # exists, currentLength - lastTimeLength - 1 == 0 and the original raised
    # ZeroDivisionError.
    span = max(currentLength - lastTimeLength - 1, 1)
    # BUG FIX: use a context manager so the file is closed even if a
    # chapter download raises mid-loop.
    with open(self.path + fileName + '.txt', 'a') as f:
        for i in range(lastTimeLength, currentLength):
            if i == 0:
                f.write(self.bookTitle + '\n\n')
            f.write(wp.downloadFromPage(wp.rootUrl + chList[i]['href']) + '\n')
            print("\rDownloading " + self.bookTitle + ":" +
                  str(int((i - lastTimeLength) * 100.0 / span)) + ' %',
                  end='', flush=True)
    # '下载完成' = "download complete"
    sys.stdout.write('\n\n 下载完成:' + self.bookTitle + '\n\n')
    # record the current chapter count for the next run
    self.log(currentLength)
    return fileName + '.txt', self.path + fileName + '.txt'
def parse(self):
    """Fetch self.url (if not already fetched) and parse it into a result dict.

    Returns a dict describing the page (image or article fields), or None
    when nothing could be fetched.  Raises PageFetchError on I/O failure
    and WebParseError when HTML parsing fails.
    """
    if self.html is None:
        try:
            dw = self.fetch_page(self.url)
            self.html = dw['content']
            if self.html is None:  # was `== None`
                return None
            if dw['type'] == 'image':
                # BUG FIX: the original tested `'netloc' in p`, which checks
                # tuple *membership* on the ParseResult (never true for the
                # attribute name).  p.netloc is already '' when the URL has
                # no host, so plain attribute access is correct.
                p = urlparse(self.url)
                result = {}
                result['images'] = [{'url': self.url}]
                result['provider_display'] = p.netloc.lower()
                result['url'] = self.url
                result['type'] = 'image'
                result['description'] = ''
                result['content'] = dw['content']
                result['title'] = ''
                return result
            if dw['type'] == 'text':
                content = dw['content'].strip()
                result = {}
                result['images'] = []
                result['url'] = self.url
                result['type'] = 'article'
                result['description'] = self.summarize(content, 75)
                result['content'] = content
                result['title'] = self.summarize(content, 10)
                return result
        except IOError:
            raise PageFetchError
    result = {}
    try:
        wp = WebParser(self.html, self.url)
        (self.dom_tree, self.html) = wp.normalize()
        result = wp.extract()
    except Exception as e:  # Py3-compatible (was `except Exception, e`)
        stack = traceback.format_stack(sys.exc_info()[2].tb_frame)
        ss = "".join(stack)
        tb = traceback.format_tb(sys.exc_info()[2])
        stb = "".join(tb)
        raise WebParseError("{0}\n{1}\n{2}".format(stb, ss, e))
    # BUG FIX: the original computed wp.extract() and then fell off the end,
    # implicitly returning None; hand the extraction result to the caller.
    return result
def extract_content(self):
    """Extract the main content of self.url via the site-specific parsers.

    Returns a dict with at least a 'content' key (and 'description' when a
    site parser matched), or None when the page could not be handled.
    Raises PageFetchError on I/O failure, WebSummarizeError on parse errors.
    """
    if self.html is None:
        try:
            dw = self.fetch_page(self.url)
            self.html = dw['content']
            if self.html is None:  # was `== None`
                return None
            if dw['type'] == 'image':
                return {'content': '<img src="{0}"/>'.format(self.url)}
            if dw['type'] == 'text':
                return {'content': dw['content']}
        except IOError:
            raise PageFetchError
    if self.dom_tree is None:
        wp = WebParser(self.html, self.url)
        (self.dom_tree, self.html) = wp.normalize()
    import SiteParser
    try:
        site = SiteParser.Sites(self.url)
        if site.is_match():
            result = site.parse(self.html, self.dom_tree)
            if 'content' in result:
                # collapse continuous whitespace runs to single spaces
                result['content'] = re.sub(r'\s+', ' ', result['content'])
                soul_tree = lxml.html.fromstring(result['content'])
                soul_text_only = soul_tree.text_content()
                result['description'] = self.summarize(soul_text_only, 75)
                return result
        return None
    except Exception as e:  # Py3-compatible (was `except Exception, e`)
        stack = traceback.format_stack(sys.exc_info()[2].tb_frame)
        ss = "".join(stack)
        tb = traceback.format_tb(sys.exc_info()[2])
        stb = "".join(tb)
        raise WebSummarizeError("{0}\n{1}\n{2}".format(stb, ss, e))
def __init__(self, bookTitle=None, url=None):
    """Initialise from either a book title (searched online) or a direct URL.

    bookTitle -- search for this title via wp.searchBook and build the URL.
    url       -- use this menu URL directly with a placeholder title.
    If neither is given, no attributes are set (caller must load a URL later).
    """
    if bookTitle:
        self.url, self.bookTitle = wp.searchBook(bookTitle)
        # BUG FIX: str.replace returns a new string; the original call
        # discarded the result, so spaces were never removed.
        self.bookTitle = self.bookTitle.replace(' ', '')
        self.url = wp.rootUrl + self.url
    elif url:
        self.bookTitle = "untitled"
        self.url = url
def webAndFile():
    """Run the file-data search and the web-data search back to back."""
    separator = "===================================================="

    # gather both data sources up front
    data_load.get_traversal_data()
    file_data = indexer.read_data()
    web_data = WebParser.webData()

    print("File data search:")
    print(separator)
    FileSearcher.fileSearch(file_data)

    print("Web data search:")
    print(separator)
    WebSearcher.webSearcher(web_data)
def parse(self):
    """Fetch self.url (if needed) and parse it into a result dict (V2 extractor).

    Returns a minimal dict for raw image/text responses, the extractV2()
    result for HTML pages, or None when nothing could be fetched.
    Raises PageFetchError on I/O failure, WebParseError on parse errors.
    """
    if self.html is None:
        try:
            dw = self.fetch_page(self.url)
            self.html = dw['content']
            if self.html is None:  # was `== None`
                return None
            if dw['type'] == 'image':
                result = {
                    'images': [{'url': self.url}],
                    'url': self.url,
                    'type': 'image',
                    'title': '',
                }
                return result
            if dw['type'] == 'text':
                result = {
                    'images': [],
                    'url': self.url,
                    'type': 'article',
                    'title': '',
                }
                return result
        except IOError:
            raise PageFetchError
    result = {}
    try:
        wp = WebParser(self.html, self.url)
        (self.dom_tree, self.html) = wp.normalize()
        result = wp.extractV2()
    except Exception as e:  # Py3-compatible (was `except Exception, e`)
        stack = traceback.format_stack(sys.exc_info()[2].tb_frame)
        ss = "".join(stack)
        tb = traceback.format_tb(sys.exc_info()[2])
        stb = "".join(tb)
        raise WebParseError("{0}\n{1}\n{2}".format(stb, ss, e))
    # BUG FIX: the original computed wp.extractV2() and then fell off the
    # end, implicitly returning None; return the extraction result.
    return result
def main():
    """Tally word frequencies across all fortune scripts and print them sorted.

    Runs every parser function from WebParser.parsingAll() over one random
    personal-data record, counts whitespace-split words, and prints the
    (word, count) pairs sorted by count, descending.
    """
    from collections import Counter

    luck_counts = Counter()
    for _ in range(0, 1):  # single pass; loop kept for easy scaling
        personal_data = randInput()
        # list of parser functions
        funcs = WebParser.parsingAll()
        for func in funcs:
            for script in func(personal_data):
                # IDIOM: Counter replaces the manual if/else tallying dict
                for word in script.split(' '):
                    luck_counts[word] += 1
    luckList = sorted(luck_counts.items(), key=itemgetter(1), reverse=True)
    print(luckList)
class NightbotParser:
    """Scrapes build-slave and build-status information from Buildbot web pages
    via a WebParser helper (created lazily in initialise())."""

    def __init__(self, url, debug_mode, output_file_name):
        self.url = url
        self.debugMode = debug_mode
        self.outputFileName = output_file_name
        self.webParser = None  # set by initialise()

    def initialise(self):
        # create and initialise the WebParser helper
        self.webParser = WebParser(self.url, self.debugMode,
                                   self.outputFileName)
        self.webParser.initialise()

    # --- href predicates used as BeautifulSoup find_all filters ---

    def with_builders(self, href):
        return href and re.compile("builders").search(href)

    def with_builds(self, href):
        return href and re.compile("builds").search(href)

    def without_builds(self, href):
        return href and not re.compile("builds").search(href)

    def createBuildSlaveListItem(self, tag):
        """Return [slave name, builder name, builder name, ...] for one row."""
        multi_dim_list = [tag.b.a.string]  # build slave name first
        for tag_a in tag.find_all(href=self.with_builders):
            multi_dim_list.append(tag_a.string)  # builder names
        return multi_dim_list

    def getBuildSlaveList(self):
        """Return one list item per 'tr' row with class 'alt' or ''."""
        attrs = {'class': ['alt', '']}
        build_slave_tags = self.webParser.findTagWithAttrs('tr', attrs)
        return [self.createBuildSlaveListItem(t) for t in build_slave_tags]

    def createBuildSlaveStatusListItem(self, tag):
        """Return [builder name, build number, ...] pairs for one list item."""
        multi_dim_list = []
        tags_a_builder_name = tag.find_all(href=self.without_builds)
        tags_a_build_number = tag.find_all(href=self.with_builds)
        for builder_name, build_number in zip(tags_a_builder_name,
                                              tags_a_build_number):
            multi_dim_list.append(builder_name.string)
            multi_dim_list.append(build_number.string)
        return multi_dim_list

    def getBuildSlaveStatusList(self):
        """Return the status list built from every 'li' tag on the page."""
        build_slave_status_tags = self.webParser.findTag('li')
        return [self.createBuildSlaveStatusListItem(t)
                for t in build_slave_status_tags]

    def getBuildInformation(self):
        """Scrape progress / result / revision / reason from the first
        'div.column' block and return them as a dict (defaults below are
        used when a field cannot be found)."""
        build_info_list = {
            'Build In progress:': 'Completed',
            'Result:': 'Not completed',
            'Revision:': 'Not idendified',
            'Reason:': 'Not determined',
        }
        build_info_result = self.webParser.findTagWithAttrsAndLimit(
            'div', 1, {'class': 'column'})

        # Build-in-progress: take the first non-empty sibling string.
        # BUG FIX: the original used `is not 0`, which identity-compares an
        # int literal (implementation-defined, SyntaxWarning on 3.8+).
        build_tag = build_info_result[0].find_all('h2',
                                                  string="Build In Progress:")
        if len(build_tag[0].next_sibling.string.strip()) != 0:
            build_info_list['Build In progress:'] = \
                build_tag[0].next_sibling.string.strip()
        else:
            build_info_list['Build In progress:'] = \
                build_tag[0].next_sibling.next_sibling.string.strip()

        # build result
        build_tag = build_info_result[0].find_all('h2', string="Results")
        if len(build_tag) != 0:
            build_info_list['Result:'] = build_tag[0].next_sibling.string

        # svn revision
        build_tag = build_info_result[0].find_all('td', string="Got Revision")
        if len(build_tag) != 0:
            build_info_list['Revision:'] = build_tag[0].next_sibling.string

        # build reason: the quoted part of the first <p> containing a quote
        build_tag = build_info_result[0].find_all('p', text=re.compile("'"))
        build_reason_string = build_tag[0].string.strip()
        build_reason_string = build_reason_string[
            :build_reason_string.rfind("'")]
        build_reason_string = build_reason_string[
            build_reason_string.rfind("'"):]
        build_info_list['Reason:'] = build_reason_string
        return build_info_list
def initialise(self):
    """Build the WebParser helper and run its own initialisation."""
    parser = WebParser(self.url, self.debugMode, self.outputFileName)
    self.webParser = parser
    parser.initialise()
def webSearchCall():
    """Fetch the parsed web data and run the web searcher over it."""
    webSearcher(WebParser.webParserData())
def action_switcher(intent: str, parameter: str) -> str:
    """Map a chatbot intent (plus optional parameter) to a reply string.

    intent    -- the NLU-detected intent name.
    parameter -- intent-specific argument (e.g. a course name); may be ''.
    Returns the bot's reply; unknown intents get a fallback message.
    """
    # NOTE(review): removed unused `global index` / `global myd` declarations
    # — neither name was read or written in this function.
    if intent == "faq-location":
        return "Your university is located here: " + \
               "https://www.google.com/maps/place/Department+of+Informatics+and+Telecommunications/@37.968141,23.7643221,17z"
    elif intent == "mystudies-grade":
        if parameter == '':
            return "please ask \"what is my grade on <course>\""
        wb = WebParser.SeleniumWebParser()
        grade = wb.get_grade_of(parameter)
        return 'Your grade is ' + grade
    elif intent == 'eclass-deadlines':
        # hard-coded placeholder reply
        return """
        The deadline for your assignments in ΗΛΕΚΤΡΟΜΑΓΝΗΤΙΣΜΟΣ, ΟΠΤΙΚΗ, ΣΥΓΧΡΟΝΗ ΦΥΣΙΚΗ are:
        3η Εργασία Φυσικής
        Time remaining: 32 days 22 hours 31 minutes
        """
    elif intent == "mystudies-grade-avg":
        wb = WebParser.SeleniumWebParser()
        grade = wb.get_average_grades()
        print('exit from func gpa, res = ', grade)
        return 'Your gpa is ' + grade
    elif intent == "mystudies-courses_declaration":
        # not implemented: falls through to the fallback reply below
        pass
    elif intent == "eclass-announcement-course":
        wb = WebParser.SeleniumWebParser()
        announcement = wb.get_eclass_element(0, parameter)
        return """Most recent announcement from """ + parameter + """ : """ + announcement
    # BUG FIX: the original compared against "eclass-announcements " with a
    # trailing space, so that branch could never match the evident intent.
    elif intent == "eclass-deadline" or intent == "eclass-announcements":
        return "Not implemented yet."
    elif intent == 'faq-pps':
        return "The university courses can be found here: http://www.di.uoa.gr/undergraduate/coursesnew"
    elif intent == "test__name":
        return 'Hello I am DiBot!'
    elif intent == 'help':
        return """
        - name (whats ur name?)
        - faq: university location (where is university?)
        - faq: curriculum (what courses are offered here?)'
        - eclass: course deadlines (whats my next deadlines on <course> )
        - eclass: course announcements (any news from course <course> )
        - mystudies: course grade (whats my grade on <course>)
        - mystudies: average grade (what is my gpa)
        """
    # BUG FIX: typo in the user-facing fallback message ("din't")
    return "I didn't quite understand :( "
def update(self, html):
    # Constructs a WebParser with a hard-coded placeholder payload and this
    # object as its second argument.
    # NOTE(review): the `html` parameter is accepted but never used — confirm
    # whether it should be parsed here instead of this placeholder dict.
    # NOTE(review): the WebParser instance is discarded; presumably its
    # constructor side-effects update `self` — verify against wp.WebParser.
    wp.WebParser({"name": "entreprise name", "teaser": "teaser"}, self)