def download(self, lastNchapter=-1):
    """Download new chapters of the current book and append them to a text file.

    lastNchapter -- download only the last N chapters; -1 (default) means
    resume from the chapter count recorded by the previous run (self.checkLog()).
    Returns (file name, full path) of the written .txt file.
    Exits the process on bad URL / unexpected list length / nothing new.
    """
    if self.url is None:
        print("Error, no url loaded")
        exit(1)
    self.bookTitle = wp.getBookTitle(self.url)
    if 'html' in self.url:
        print('Menu URL error')
        exit(0)
    self.updatePath()
    chList = wp.getChapterList(self.url)
    currentLength = len(chList)
    if lastNchapter == -1:
        # check how many chapters the previous run already downloaded
        lastTimeLength = self.checkLog()
    else:
        lastTimeLength = int(currentLength) - int(lastNchapter) - 1
    if lastTimeLength > currentLength:
        print('Unexpected list length, check lastTimeLength')
        exit(1)
    elif lastTimeLength == currentLength:
        # message text is user-facing; '未更新' = "not updated"
        print(self.bookTitle + ' --> 未更新' + str(currentLength))
        exit(0)
    fileName = str(lastTimeLength + 1) + '-' + str(currentLength) + ' ' + self.bookTitle
    # BUG FIX: guard the progress denominator — when exactly one new chapter
    # exists, currentLength - lastTimeLength - 1 == 0 and the original raised
    # ZeroDivisionError.
    span = max(currentLength - lastTimeLength - 1, 1)
    # BUG FIX: use a context manager so the file is closed even if a
    # chapter download raises mid-loop.
    with open(self.path + fileName + '.txt', 'a') as f:
        for i in range(lastTimeLength, currentLength):
            if i == 0:
                f.write(self.bookTitle + '\n\n')
            f.write(wp.downloadFromPage(wp.rootUrl + chList[i]['href']) + '\n')
            print("\rDownloading " + self.bookTitle + ":" +
                  str(int((i - lastTimeLength) * 100.0 / span)) + ' %',
                  end='', flush=True)
    # '下载完成' = "download complete"
    sys.stdout.write('\n\n 下载完成:' + self.bookTitle + '\n\n')
    # record the current chapter count for the next run
    self.log(currentLength)
    return fileName + '.txt', self.path + fileName + '.txt'
def parse(self):
    """Fetch self.url (if not already fetched) and parse it into a result dict.

    Returns a dict describing the page (image or article fields), or None
    when nothing could be fetched.  Raises PageFetchError on I/O failure
    and WebParseError when HTML parsing fails.
    """
    if self.html is None:
        try:
            dw = self.fetch_page(self.url)
            self.html = dw['content']
            if self.html is None:  # was `== None`
                return None
            if dw['type'] == 'image':
                # BUG FIX: the original tested `'netloc' in p`, which checks
                # tuple *membership* on the ParseResult (never true for the
                # attribute name).  p.netloc is already '' when the URL has
                # no host, so plain attribute access is correct.
                p = urlparse(self.url)
                result = {}
                result['images'] = [{'url': self.url}]
                result['provider_display'] = p.netloc.lower()
                result['url'] = self.url
                result['type'] = 'image'
                result['description'] = ''
                result['content'] = dw['content']
                result['title'] = ''
                return result
            if dw['type'] == 'text':
                content = dw['content'].strip()
                result = {}
                result['images'] = []
                result['url'] = self.url
                result['type'] = 'article'
                result['description'] = self.summarize(content, 75)
                result['content'] = content
                result['title'] = self.summarize(content, 10)
                return result
        except IOError:
            raise PageFetchError
    result = {}
    try:
        wp = WebParser(self.html, self.url)
        (self.dom_tree, self.html) = wp.normalize()
        result = wp.extract()
    except Exception as e:  # Py3-compatible (was `except Exception, e`)
        stack = traceback.format_stack(sys.exc_info()[2].tb_frame)
        ss = "".join(stack)
        tb = traceback.format_tb(sys.exc_info()[2])
        stb = "".join(tb)
        raise WebParseError("{0}\n{1}\n{2}".format(stb, ss, e))
    # BUG FIX: the original computed wp.extract() and then fell off the end,
    # implicitly returning None; hand the extraction result to the caller.
    return result
def extract_content(self):
    """Extract the main content of self.url via the site-specific parsers.

    Returns a dict with at least a 'content' key (and 'description' when a
    site parser matched), or None when the page could not be handled.
    Raises PageFetchError on I/O failure, WebSummarizeError on parse errors.
    """
    if self.html is None:
        try:
            dw = self.fetch_page(self.url)
            self.html = dw['content']
            if self.html is None:  # was `== None`
                return None
            if dw['type'] == 'image':
                return {'content': '<img src="{0}"/>'.format(self.url)}
            if dw['type'] == 'text':
                return {'content': dw['content']}
        except IOError:
            raise PageFetchError
    if self.dom_tree is None:
        wp = WebParser(self.html, self.url)
        (self.dom_tree, self.html) = wp.normalize()
    import SiteParser
    try:
        site = SiteParser.Sites(self.url)
        if site.is_match():
            result = site.parse(self.html, self.dom_tree)
            if 'content' in result:
                # collapse continuous whitespace runs to single spaces
                result['content'] = re.sub(r'\s+', ' ', result['content'])
                soul_tree = lxml.html.fromstring(result['content'])
                soul_text_only = soul_tree.text_content()
                result['description'] = self.summarize(soul_text_only, 75)
                return result
        return None
    except Exception as e:  # Py3-compatible (was `except Exception, e`)
        stack = traceback.format_stack(sys.exc_info()[2].tb_frame)
        ss = "".join(stack)
        tb = traceback.format_tb(sys.exc_info()[2])
        stb = "".join(tb)
        raise WebSummarizeError("{0}\n{1}\n{2}".format(stb, ss, e))
def __init__(self, bookTitle=None, url=None):
    """Initialise from either a book title (searched online) or a direct URL.

    bookTitle -- search for this title via wp.searchBook and build the URL.
    url       -- use this menu URL directly with a placeholder title.
    If neither is given, no attributes are set (caller must load a URL later).
    """
    if bookTitle:
        self.url, self.bookTitle = wp.searchBook(bookTitle)
        # BUG FIX: str.replace returns a new string; the original call
        # discarded the result, so spaces were never removed.
        self.bookTitle = self.bookTitle.replace(' ', '')
        self.url = wp.rootUrl + self.url
    elif url:
        self.bookTitle = "untitled"
        self.url = url
def webAndFile():
    """Run the file-data search and the web-data search back to back."""
    separator = "===================================================="

    # gather both data sources up front
    data_load.get_traversal_data()
    file_data = indexer.read_data()
    web_data = WebParser.webData()

    print("File data search:")
    print(separator)
    FileSearcher.fileSearch(file_data)

    print("Web data search:")
    print(separator)
    WebSearcher.webSearcher(web_data)
def parse(self):
    """Fetch self.url (if needed) and parse it into a result dict (V2 extractor).

    Returns a minimal dict for raw image/text responses, the extractV2()
    result for HTML pages, or None when nothing could be fetched.
    Raises PageFetchError on I/O failure, WebParseError on parse errors.
    """
    if self.html is None:
        try:
            dw = self.fetch_page(self.url)
            self.html = dw['content']
            if self.html is None:  # was `== None`
                return None
            if dw['type'] == 'image':
                result = {
                    'images': [{'url': self.url}],
                    'url': self.url,
                    'type': 'image',
                    'title': '',
                }
                return result
            if dw['type'] == 'text':
                result = {
                    'images': [],
                    'url': self.url,
                    'type': 'article',
                    'title': '',
                }
                return result
        except IOError:
            raise PageFetchError
    result = {}
    try:
        wp = WebParser(self.html, self.url)
        (self.dom_tree, self.html) = wp.normalize()
        result = wp.extractV2()
    except Exception as e:  # Py3-compatible (was `except Exception, e`)
        stack = traceback.format_stack(sys.exc_info()[2].tb_frame)
        ss = "".join(stack)
        tb = traceback.format_tb(sys.exc_info()[2])
        stb = "".join(tb)
        raise WebParseError("{0}\n{1}\n{2}".format(stb, ss, e))
    # BUG FIX: the original computed wp.extractV2() and then fell off the
    # end, implicitly returning None; return the extraction result.
    return result
def main():
    """Tally word frequencies across all fortune scripts and print them sorted.

    Runs every parser function from WebParser.parsingAll() over one random
    personal-data record, counts whitespace-split words, and prints the
    (word, count) pairs sorted by count, descending.
    """
    from collections import Counter

    luck_counts = Counter()
    for _ in range(0, 1):  # single pass; loop kept for easy scaling
        personal_data = randInput()
        # list of parser functions
        funcs = WebParser.parsingAll()
        for func in funcs:
            for script in func(personal_data):
                # IDIOM: Counter replaces the manual if/else tallying dict
                for word in script.split(' '):
                    luck_counts[word] += 1
    luckList = sorted(luck_counts.items(), key=itemgetter(1), reverse=True)
    print(luckList)
class NightbotParser:
    """Scrapes build-slave and build-status information from Buildbot web pages
    via a WebParser helper (created lazily in initialise())."""

    def __init__(self, url, debug_mode, output_file_name):
        self.url = url
        self.debugMode = debug_mode
        self.outputFileName = output_file_name
        self.webParser = None  # set by initialise()

    def initialise(self):
        # create and initialise the WebParser helper
        self.webParser = WebParser(self.url, self.debugMode,
                                   self.outputFileName)
        self.webParser.initialise()

    # --- href predicates used as BeautifulSoup find_all filters ---

    def with_builders(self, href):
        return href and re.compile("builders").search(href)

    def with_builds(self, href):
        return href and re.compile("builds").search(href)

    def without_builds(self, href):
        return href and not re.compile("builds").search(href)

    def createBuildSlaveListItem(self, tag):
        """Return [slave name, builder name, builder name, ...] for one row."""
        multi_dim_list = [tag.b.a.string]  # build slave name first
        for tag_a in tag.find_all(href=self.with_builders):
            multi_dim_list.append(tag_a.string)  # builder names
        return multi_dim_list

    def getBuildSlaveList(self):
        """Return one list item per 'tr' row with class 'alt' or ''."""
        attrs = {'class': ['alt', '']}
        build_slave_tags = self.webParser.findTagWithAttrs('tr', attrs)
        return [self.createBuildSlaveListItem(t) for t in build_slave_tags]

    def createBuildSlaveStatusListItem(self, tag):
        """Return [builder name, build number, ...] pairs for one list item."""
        multi_dim_list = []
        tags_a_builder_name = tag.find_all(href=self.without_builds)
        tags_a_build_number = tag.find_all(href=self.with_builds)
        for builder_name, build_number in zip(tags_a_builder_name,
                                              tags_a_build_number):
            multi_dim_list.append(builder_name.string)
            multi_dim_list.append(build_number.string)
        return multi_dim_list

    def getBuildSlaveStatusList(self):
        """Return the status list built from every 'li' tag on the page."""
        build_slave_status_tags = self.webParser.findTag('li')
        return [self.createBuildSlaveStatusListItem(t)
                for t in build_slave_status_tags]

    def getBuildInformation(self):
        """Scrape progress / result / revision / reason from the first
        'div.column' block and return them as a dict (defaults below are
        used when a field cannot be found)."""
        build_info_list = {
            'Build In progress:': 'Completed',
            'Result:': 'Not completed',
            'Revision:': 'Not idendified',
            'Reason:': 'Not determined',
        }
        build_info_result = self.webParser.findTagWithAttrsAndLimit(
            'div', 1, {'class': 'column'})

        # Build-in-progress: take the first non-empty sibling string.
        # BUG FIX: the original used `is not 0`, which identity-compares an
        # int literal (implementation-defined, SyntaxWarning on 3.8+).
        build_tag = build_info_result[0].find_all('h2',
                                                  string="Build In Progress:")
        if len(build_tag[0].next_sibling.string.strip()) != 0:
            build_info_list['Build In progress:'] = \
                build_tag[0].next_sibling.string.strip()
        else:
            build_info_list['Build In progress:'] = \
                build_tag[0].next_sibling.next_sibling.string.strip()

        # build result
        build_tag = build_info_result[0].find_all('h2', string="Results")
        if len(build_tag) != 0:
            build_info_list['Result:'] = build_tag[0].next_sibling.string

        # svn revision
        build_tag = build_info_result[0].find_all('td', string="Got Revision")
        if len(build_tag) != 0:
            build_info_list['Revision:'] = build_tag[0].next_sibling.string

        # build reason: the quoted part of the first <p> containing a quote
        build_tag = build_info_result[0].find_all('p', text=re.compile("'"))
        build_reason_string = build_tag[0].string.strip()
        build_reason_string = build_reason_string[
            :build_reason_string.rfind("'")]
        build_reason_string = build_reason_string[
            build_reason_string.rfind("'"):]
        build_info_list['Reason:'] = build_reason_string
        return build_info_list
def initialise(self):
    """Build the WebParser helper and run its own initialisation."""
    parser = WebParser(self.url, self.debugMode, self.outputFileName)
    self.webParser = parser
    parser.initialise()
def webSearchCall():
    """Fetch the parsed web data and run the web searcher over it."""
    webSearcher(WebParser.webParserData())
def action_switcher(intent: str, parameter: str) -> str:
    """Map a chatbot intent (plus optional parameter) to a reply string.

    intent    -- the NLU-detected intent name.
    parameter -- intent-specific argument (e.g. a course name); may be ''.
    Returns the bot's reply; unknown intents get a fallback message.
    """
    # NOTE(review): removed unused `global index` / `global myd` declarations
    # — neither name was read or written in this function.
    if intent == "faq-location":
        return "Your university is located here: " + \
               "https://www.google.com/maps/place/Department+of+Informatics+and+Telecommunications/@37.968141,23.7643221,17z"
    elif intent == "mystudies-grade":
        if parameter == '':
            return "please ask \"what is my grade on <course>\""
        wb = WebParser.SeleniumWebParser()
        grade = wb.get_grade_of(parameter)
        return 'Your grade is ' + grade
    elif intent == 'eclass-deadlines':
        # hard-coded placeholder reply
        return """
        The deadline for your assignments in ΗΛΕΚΤΡΟΜΑΓΝΗΤΙΣΜΟΣ, ΟΠΤΙΚΗ, ΣΥΓΧΡΟΝΗ ΦΥΣΙΚΗ are:
        3η Εργασία Φυσικής
        Time remaining: 32 days 22 hours 31 minutes
        """
    elif intent == "mystudies-grade-avg":
        wb = WebParser.SeleniumWebParser()
        grade = wb.get_average_grades()
        print('exit from func gpa, res = ', grade)
        return 'Your gpa is ' + grade
    elif intent == "mystudies-courses_declaration":
        # not implemented: falls through to the fallback reply below
        pass
    elif intent == "eclass-announcement-course":
        wb = WebParser.SeleniumWebParser()
        announcement = wb.get_eclass_element(0, parameter)
        return """Most recent announcement from """ + parameter + """ : """ + announcement
    # BUG FIX: the original compared against "eclass-announcements " with a
    # trailing space, so that branch could never match the evident intent.
    elif intent == "eclass-deadline" or intent == "eclass-announcements":
        return "Not implemented yet."
    elif intent == 'faq-pps':
        return "The university courses can be found here: http://www.di.uoa.gr/undergraduate/coursesnew"
    elif intent == "test__name":
        return 'Hello I am DiBot!'
    elif intent == 'help':
        return """
        - name (whats ur name?)
        - faq: university location (where is university?)
        - faq: curriculum (what courses are offered here?)'
        - eclass: course deadlines (whats my next deadlines on <course> )
        - eclass: course announcements (any news from course <course> )
        - mystudies: course grade (whats my grade on <course>)
        - mystudies: average grade (what is my gpa)
        """
    # BUG FIX: typo in the user-facing fallback message ("din't")
    return "I didn't quite understand :( "
def update(self, html):
    # Constructs a WebParser with a hard-coded placeholder payload and this
    # object as its second argument.
    # NOTE(review): the `html` parameter is accepted but never used — confirm
    # whether it should be parsed here instead of this placeholder dict.
    # NOTE(review): the WebParser instance is discarded; presumably its
    # constructor side-effects update `self` — verify against wp.WebParser.
    wp.WebParser({"name": "entreprise name", "teaser": "teaser"}, self)