import importlib
import webbrowser

import requests
from bs4 import BeautifulSoup

# YEAR and AOC_SESSION_COOKIE are module-level constants.


def submit(day: int, part: int) -> None:
    if AOC_SESSION_COOKIE is None:
        raise ValueError('Missing AOC_SESSION_COOKIE!')
    part_word = 'one' if part == 1 else 'two'
    solution_module = importlib.import_module(f'{YEAR}.{day:02}.solution')
    answer_func = getattr(solution_module, f'part_{part_word}')
    problem_input = getattr(solution_module, 'parse_data')()
    answer = answer_func(problem_input)
    resp = requests.post(
        f'https://adventofcode.com/{YEAR}/day/{day}/answer',
        cookies={'session': AOC_SESSION_COOKIE},
        data={'level': part, 'answer': answer})
    if not resp.ok:
        raise ValueError(f'Bad response from site: {resp.status_code}')
    msg = BeautifulSoup(resp.text, 'html.parser').article.text
    if msg.startswith("That's the") and part == 1:
        webbrowser.open(resp.url)
    print(f'Day {day:02} Part {part:02}: {msg}')
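A minimal driver for submit, assuming the surrounding module defines YEAR and reads AOC_SESSION_COOKIE from the environment as the guard clause suggests; the argparse wiring here is illustrative, not part of the original source.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Submit an Advent of Code answer.')
    parser.add_argument('day', type=int, help='Puzzle day, 1-25.')
    parser.add_argument('part', type=int, choices=(1, 2), help='Puzzle part.')
    ns = parser.parse_args()
    submit(ns.day, ns.part)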
def findvideos(item):
    logger.info()
    itemlist = list()
    soup = create_soup(item.url)
    matches = soup.find("ul", id="playeroptionsul")

    for elem in matches.find_all("li"):
        if "youtube" in elem.find("span", class_="server").text:
            continue
        post = {
            "action": "doo_player_ajax",
            "post": elem["data-post"],
            "nume": elem["data-nume"],
            "type": elem["data-type"]
        }
        headers = {"Referer": item.url}
        doo_url = "%swp-admin/admin-ajax.php" % host
        data = httptools.downloadpage(doo_url, post=post, headers=headers).json
        try:
            url = BeautifulSoup(data.get("embed_url", ""), "html5lib").find("iframe")["src"]
        except Exception:
            continue
        if not url.startswith("http"):
            url = "https:%s" % url
        itemlist.append(
            Item(channel=item.channel, title="%s", action="play", url=url,
                 language="LAT", infoLabels=item.infoLabels))

    itemlist = servertools.get_servers_itemlist(
        itemlist, lambda x: x.title % x.server.capitalize())

    # Required by FilterTools
    itemlist = filtertools.get_links(itemlist, item, list_language)

    # Required by AutoPlay
    autoplay.start(itemlist, item)

    if item.contentType != "episode":
        if config.get_videolibrary_support() and len(itemlist) > 0 and item.extra != "findvideos":
            itemlist.append(
                Item(channel=item.channel,
                     title="[COLOR yellow]Añadir esta pelicula a la videoteca[/COLOR]",
                     url=item.url, action="add_pelicula_to_library",
                     extra="findvideos", contentTitle=item.contentTitle))

    return itemlist
def crude_parsing(self):
    crude_list = []
    standard_fields = [
        "from:", "to:", "cc:", "bcc:", "mime-version:", "content-type:",
        "x-from:", "x-to:", "x-cc:", "content-transfer-encoding:",
        "x-bcc:", "x-filename", "subject:", "message-id:", "x-origin:"
    ]
    # Read as UTF-8, silently dropping undecodable bytes.
    with open(self.origin_file, encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.lower()
            try:
                line = BeautifulSoup(line, "html.parser").getText()
            except Exception:
                line = ""
            line = line.lower()
            if line in ['\n', '\r\n']:
                crude_list.append("content: " + line.strip())
            else:
                content = False
                for field in standard_fields:
                    if line.startswith(field):
                        content = True
                        crude_list.append(line.strip())
                if not content:
                    # Continuation line: append it to the previous entry.
                    if len(crude_list) > 0:
                        crude_list[len(crude_list) - 1] += " " + line.strip()
                    else:
                        crude_list.append("content: " + line.strip())
    return crude_list
def check_reference(dsl_lookuper, word, article):
    # Special case for articles in En-En-Longman_DOCE5.dsl
    text = BeautifulSoup(article, 'html.parser').text
    if text.startswith(STR_SEE_MAIN_ENTRY):
        referenced_word = text[len(STR_SEE_MAIN_ENTRY):].strip()
        logging.info('Detected reference from "%s" to "%s" (LongmanDOCE5)',
                     word, referenced_word)
        return lookup_word(dsl_lookuper, referenced_word)

    # Special case for CambridgeAdvancedLearners
    main_entry_start = article.find(STR_MAIN_ENTRY)
    if main_entry_start != -1:
        article_rest = article[main_entry_start + len(STR_MAIN_ENTRY):]
        match = RE_A_HREF.search(article_rest)
        if match:
            referenced_word = match.group(1)
            if referenced_word != word:
                logging.info(
                    'Detected reference from "%s" to "%s" (CambridgeAdvancedLearners)',
                    word, referenced_word)
                more_article, more_examples = lookup_word(dsl_lookuper, referenced_word)
                return article + more_article, more_examples

    # Special case for LingvoUniversal
    if len(text) < SHORT_ARTICLE_LENGTH:
        match = RE_SHORT_REFERENCE.search(text)
        if match:
            referenced_word = match.group(1)
            if word == referenced_word:
                logging.warning(
                    'Self reference from "%s" to "%s", skipping (LingvoUniversal)',
                    word, referenced_word)
            else:
                logging.info(
                    'Detected reference from "%s" to "%s" (LingvoUniversal)',
                    word, referenced_word)
                return lookup_word(dsl_lookuper, referenced_word)

    # Special case for En-En_American_Heritage_Dictionary.dsl
    match = RE_SEE_OTHER.search(text)
    if match:
        referenced_word = match.group(1)
        if referenced_word != word:
            logging.info(
                'Detected reference from "%s" to "%s" (AmericanHeritageDictionary)',
                word, referenced_word)
            return lookup_word(dsl_lookuper, referenced_word)

    return article, None
def downloadJacket(self, filename):
    """Downloads the book's jacket image.

    Args:
        filename (str): The name of the file to save the image to.
    """
    page = requests.get(
        'https://sfpl.bibliocommons.com/item/show/{}'.format(self._id)).text
    image_url = BeautifulSoup(page, 'lxml').find(
        class_='jacketCover bib_detail')['src']
    # Protocol-relative URLs come back without a scheme.
    if not image_url.startswith('http'):
        image_url = 'https:{}'.format(image_url)
    with open('{}.png'.format(filename), 'wb') as jacket:
        jacket.write(requests.get(image_url).content)
def build_header_body(h, b, lineList):
    """ Merges referring lines into the header. """
    if len(lineList) > 0:
        firstLineText = BeautifulSoup(lineList[0], "html.parser").get_text().strip()
        if firstLineText.startswith(("siehe", "mit siehe", "vgl.")):
            h += lineList[0]
            lineList = lineList[1:]
            h, b = build_header_body(h, b, lineList)
        else:
            b = "<br>".join(lineList)
    return (h, b)
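A small worked example of the recursive merge above, with hypothetical input lines: any leading "siehe"/"vgl." lines are folded into the header, and the remaining lines become the body.

lines = ['<i>siehe auch Beispiel</i>', 'Erste Zeile', 'Zweite Zeile']
header, body = build_header_body('<b>Stichwort</b>', '', lines)
# header == '<b>Stichwort</b><i>siehe auch Beispiel</i>'
# body == 'Erste Zeile<br>Zweite Zeile'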
def latex_with_markup(self, string):
    string = BeautifulSoup(string, "lxml").text
    string = string.replace("\\", "\\\\")
    # Protect {...} spans with placeholder brackets until footnote handling.
    string = re.sub(r"\{(.*?)\}", r"≾\1≿", string)
    # Wrap runs of Latin script, digits, and adjacent punctuation.
    string = regex.sub(
        r"(([\p{IsLatin}\d\:\,\–\-\.]+\s*)+)", r"\\textenglish{\1}", string
    )
    string = string.replace(" }", "} ")
    string = re.sub(r"\[(.*?)\]", r"\\footnote{\1}", string)
    string = re.sub(r"≾(.*?)≿", r"\\footnote{i.e. \1}", string)
    # string = re.sub(r'\{(.*?)\}', r'\\footnote{i.e. \1}', string)
    # string = string.replace('؟', '')
    string = self.latex_punctuation(string)
    if string.startswith("\\footnote"):
        string = "\\textenglish{[See footnote.]}" + string
    return string
import codecs
import json
import re

from bs4 import BeautifulSoup


def save_Q_A():
    """Reformat the questions and answers in the humor_Q_A file and write them to a local file."""
    qa_list = []
    with codecs.open('./data/humor_Q_A_old_version2.txt', encoding='utf-8') as f:
        for line in f.readlines():
            json_line = json.loads(line)
            Q = BeautifulSoup(json_line['Q'], 'html.parser').text.strip()
            A_str = BeautifulSoup(json_line['A'], 'html.parser').text.strip()
            if A_str.startswith('<'):
                # Escape angle brackets so the answer is not rendered as HTML.
                A_str = A_str.replace('<', '&lt;').replace('>', '&gt;')
            A = 'A:' + '<br>A:'.join(re.split('\n\n+', A_str))
            # print(Q)
            # print(A)
            Q_A_str = ('<table><tr><td><font color="#4EABF9"><u>%s</u></font>'
                       '<br>%s</td></tr></table>\n' % (Q, A))
            qa_list.append(Q_A_str)
    # Send to a mailbox
    # mail_to = "*****@*****.**"
    # send_to_163_mail(''.join(qa_list), mail_to)
    # Write to a local file
    codecs.open('Q_A.txt', mode='wb', encoding='utf-8').writelines(qa_list)
def play(item):
    logger.info()
    itemlist = list()
    doo_url = "%swp-admin/admin-ajax.php" % host
    data = httptools.downloadpage(doo_url, post=item.post,
                                  headers={"referer": item.ref}).data
    try:
        url = BeautifulSoup(data, "html5lib").find("iframe")["src"]
    except Exception:
        return itemlist
    if not url.startswith("http"):
        url = "https:%s" % url
    itemlist.append(item.clone(url=url, server=''))
    itemlist = servertools.get_servers_itemlist(itemlist)
    return itemlist
def sectionalize_8k(db, url, file_name):
    global total_files_processed
    global total_files_passed
    global total_files_failed
    global total_files_no_data
    global total_exibit_passed
    global total_exibit_failed
    global total_exibit_no_data

    # Open the file and read its content.
    with open(url, "r", encoding="utf-8") as fp:
        webContent = fp.readlines()
    ind = 0

    # These lists collectively hold all information about the 8-K items.
    item_no_list = []       # item numbers
    title_list = []         # titles
    item_text_list = []     # item texts
    exhibit_no_list = []    # all exhibit numbers of the particular form
    exhibit_text_list = []  # descriptions of the corresponding exhibit numbers
    is_exhibit = False

    try:
        while True:
            # Parse the HTML text at the current index of the file.
            html_line = BeautifulSoup(webContent[ind], "html.parser").text.strip().lower()

            # Exit condition: nothing beyond the "signature" text is needed.
            if html_line.startswith("signature"):
                break

            # Increment the index to fetch the next line on the next iteration.
            ind += 1

            # The block below retrieves text starting with the "item" keyword.
            if html_line != "" and html_line.startswith("item"):
                temp_ind = ind - 1
                item_no = ""    # item number
                title = ""      # title of the item number
                item_text = ""  # text under the item number
                temp = BeautifulSoup(webContent[temp_ind], "html.parser").text.strip()

                # Check for the item number and its title. The first branch
                # matches a line holding only the item number, with the title
                # on a following line; the second matches an item number and
                # title on the same HTML line. (\xa0 is a non-breaking space
                # occasionally embedded in the filings.)
                if re.fullmatch(r'item\xa0*\s\d+\.\d+\.*', temp.lower()):
                    item_no = temp.lower()
                    # Find the title once the item number is found.
                    for i in range(temp_ind + 1, temp_ind + 10):
                        if len(webContent) > i:
                            title_html = BeautifulSoup(webContent[i], "html.parser").text.strip()
                            if title_html != "":
                                title = title_html
                    temp_ind += 1
                else:
                    # Item number with its title on a single HTML line.
                    if re.match(r'item\xa0*\s\d+\.\d+\s*\w+\.*', temp.lower()):
                        temp_split = temp.split()
                        # Find the title once the item number is found.
                        for t in temp_split:
                            if re.fullmatch(r'\d+\.\d+\.*', t):
                                item_no = "item " + t
                                t_ind = temp_split.index(t)
                                for i in temp_split[t_ind + 1:]:
                                    title += i + " "
                        temp_ind += 1

                item_no = re.sub('[^a-z0-9.,?!%()$]+', ' ', item_no)
                title = re.sub('[^A-Za-z0-9.,?!%()$]+', ' ', title)

                # Collect the text below the item number.
                for text in webContent[temp_ind + 1:]:
                    temp1 = BeautifulSoup(text, "html.parser").text.strip()
                    # Exit criteria: the next item, or the signature text.
                    if temp1.lower().startswith("item") or temp1.lower().startswith("signature"):
                        break
                    if temp1 == title:
                        continue
                    item_text += temp1

                    # Check for the exhibit number if the item number is 9.01.
                    if re.fullmatch(r'item 9.01\.*', item_no) and re.fullmatch(r'\d+\.\d+\.*', temp1):
                        exhibit_no_list.append(temp1)
                    else:
                        # If the text holds an exhibit number and its description
                        # on one line, split it and extract the exhibit number.
                        if re.fullmatch(r'item 9.01\.*', item_no) and re.match(r'\d+\.\d+\.*\s*\w+\.*', temp1):
                            temp_split = temp1.split()
                            for t in temp_split:
                                if re.fullmatch(r'\d+\.\d+\.*', t):
                                    exhibit_no_list.append(t)

                item_text = re.sub('[^A-Za-z0-9.,?!%()$]+', ' ', item_text)
                is_exhibit = False

                # Extract the exhibit description for each exhibit number.
                # The description runs from just after an exhibit number to
                # the next exhibit number, or to the end for the last one.
                if re.fullmatch(r'item 9.01\.*', item_no) and exhibit_no_list:
                    for i in range(0, len(exhibit_no_list)):
                        item_length = len(str(exhibit_no_list[i]))
                        current_index = item_text.index(str(exhibit_no_list[i]))
                        if i == len(exhibit_no_list) - 1:
                            exhibit_text_list.append(item_text[current_index + item_length:])
                        else:
                            next_index = item_text.index(str(exhibit_no_list[i + 1]))
                            exhibit_text_list.append(item_text[current_index + item_length:next_index])
                    is_exhibit = True

                # Store the item number, title, and text.
                if item_no and title and item_text:
                    item_no_list.append(item_no)
                    title_list.append(title)
                    item_text_list.append(item_text)

        # Store the 8-K form details in MongoDB.
        if item_no_list and title_list and item_text_list:
            # If the exhibit list is empty, count the form as having no
            # exhibit data; otherwise count it as an exhibit success.
            if exhibit_no_list:
                total_exibit_passed += 1
            else:
                total_exibit_no_data += 1
            insertData(db, file_name, item_no_list, title_list, item_text_list,
                       exhibit_no_list, exhibit_text_list)
            total_files_passed += 1
        else:
            total_files_no_data += 1
        log_print_statments("Processed in " + str(url))
    except Exception:
        total_files_failed += 1
        if is_exhibit is False:
            total_exibit_failed += 1
        log_print_statments("Exception in " + str(url))
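sectionalize_8k delegates persistence to an insertData helper that is not shown; a minimal sketch of what it might look like with pymongo (the collection name and document schema are assumptions):

def insertData(db, file_name, item_no_list, title_list, item_text_list,
               exhibit_no_list, exhibit_text_list):
    # Persist one parsed 8-K filing as a single MongoDB document.
    db.forms_8k.insert_one({
        'file_name': file_name,
        'items': [
            {'item_no': no, 'title': title, 'text': text}
            for no, title, text in zip(item_no_list, title_list, item_text_list)
        ],
        'exhibits': [
            {'exhibit_no': no, 'description': desc}
            for no, desc in zip(exhibit_no_list, exhibit_text_list)
        ],
    })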
def parse(self, response):
    """
    Scrapes the list of modules associated with Bottle.

    Causes scrapy to follow the links to the module docs and uses a
    different parser to extract the API information contained therein.
    """
    # Find all the function definitions on the page:
    for func in response.css('dl.function'):
        # Function details are always the first items in the dl.
        func_spec = func.css('dt')[0]
        func_doc = func.css('dd')[0]
        # Function name is always the first dt.
        func_name = BeautifulSoup(
            func_spec.css('code.descname').extract()[0], 'html.parser').text
        # Args into the function.
        args = []
        for ems in func_spec.css('em'):
            args.append(ems.extract().replace('<em>', '').replace('</em>', ''))
        # Function description.
        soup = BeautifulSoup(func_doc.extract(), 'html.parser')
        d = self.to_dict(func_name, args, soup.text)
        if d:
            yield d

    # Find all the class definitions on the page:
    for classes in response.css('dl.class'):
        # Class details are always the first items in the dl.
        class_spec = classes.css('dt')[0]
        class_doc = classes.css('dd')[0]
        # Class name is always the first dt.
        class_name = BeautifulSoup(
            class_spec.css('code.descname').extract()[0], 'html.parser').text
        # Args into __init__.
        init_args = []
        for ems in class_spec.css('em'):
            props = 'property' in ems.css('::attr(class)').extract()
            if not props:
                init_args.append(
                    ems.extract().replace('<em>', '').replace('</em>', ''))
        # Class description. Everything up to and including the field-list.
        soup = BeautifulSoup(class_doc.extract(), 'html.parser')
        contents = soup.contents[0].contents
        description = ''
        for child in contents:
            if child.name == 'p':
                description += child.text + '\n\n'
            if child.name == 'table':
                raw = child.text
                rows = [r.strip() for r in raw.split('\n') if r.strip()]
                description += '\n'
                description += '\n'.join(rows)
                break
            if child.name == 'dl':
                break
        d = self.to_dict(class_name, init_args, description)
        if d:
            yield d

        # Remaining dt are methods or attributes.
        for methods in classes.css('dl.method'):
            # Parse and yield methods.
            method_name = BeautifulSoup(
                methods.css('code.descname').extract()[0], 'html.parser').text
            if method_name.startswith('__'):
                break
            method_name = class_name + '.' + method_name
            method_args = []
            for ems in methods.css('em'):
                method_args.append(
                    ems.extract().replace('<em>', '').replace('</em>', ''))
            description = BeautifulSoup(
                methods.css('dd')[0].extract(), 'html.parser').text
            d = self.to_dict(method_name, method_args, description)
            if d:
                yield d

        for data in classes.css('dl.attribute'):
            name = BeautifulSoup(
                data.css('code.descname').extract()[0], 'html.parser').text
            name = class_name + '.' + name
            description = BeautifulSoup(
                data.css('dd')[0].extract(), 'html.parser').text
            d = self.to_dict(name, None, description)
            if d:
                yield d

        for data in classes.css('dl.data'):
            name = BeautifulSoup(
                data.css('code.descname').extract()[0], 'html.parser').text
            name = class_name + '.' + name
            description = BeautifulSoup(
                data.css('dd')[0].extract(), 'html.parser').text
            d = self.to_dict(name, None, description)
            if d:
                yield d
def getRound(soup, id, gameId, round):
    div = soup.find('div', id=id)
    if div:
        categories = div.find_all('td', class_="category_name")
        # f = open("soup.txt", "w")
        # f.write(soup.prettify())
        # f.close()
        if len(categories) == 6:
            categories = list(map(getText, categories))
            clueDivs = div.find_all('td', class_='clue')
        else:
            categories = soup.find_all('td', class_="category_name")
            clueDivs = soup.find_all('td', class_='clue')
            if round == 'Single':
                categories = categories[:6]
                clueDivs = clueDivs[:6]
            elif len(categories) >= 12:
                categories = categories[6:12]
                clueDivs = clueDivs[6:12]
            else:
                categories = []
                clueDivs = []
            categories = list(map(getText, categories))

        if len(categories) > 0:
            clues = [[0 for x in range(6)] for y in range(5)]
            answers = [[0 for x in range(6)] for y in range(5)]
            extract = re.compile('correct_response">(.*)</em>')
            row = 0
            col = 0
            numClues = 0
            for square in clueDivs:
                text = square.find('td', class_='clue_text')
                if text:
                    clues[row][col] = text.text
                    numClues += 1
                    answerDiv = square.find('div')
                    if answerDiv:
                        answer = extract.search(str(answerDiv))
                        pretty = BeautifulSoup(answer.group(1), 'html.parser').text
                        if pretty.startswith('<i>'):
                            pretty = pretty[3:]
                        if pretty.endswith('</i>'):
                            pretty = pretty[:-4]
                        answers[row][col] = pretty
                col += 1
                if col == 6:
                    col = 0
                    row += 1
            for col in range(6):
                sql = "INSERT INTO Categories (GameId, RoundCode, Name) VALUES (%s, %s, %s)"
                val = (gameId, round, categories[col])
                mycursor.execute(sql, val)
                mydb.commit()
                categoryId = mycursor.lastrowid
                sql = "INSERT INTO Clues (CategoryId, PointVal, Clue, Answer) VALUES (%s, %s, %s, %s)"
                val = []
                for row in range(5):
                    val.append((categoryId, row * 200 + 200, clues[row][col], answers[row][col]))
                mycursor.executemany(sql, val)
                mydb.commit()
            print('\t', round, ': ', numClues)
        else:
            print('\t', round, ': no clues')
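getRound leans on module-level mydb/mycursor handles and a getText helper that are not shown; a sketch of that assumed setup with mysql-connector-python (the connection parameters are placeholders):

import mysql.connector

mydb = mysql.connector.connect(host='localhost', user='user',
                               password='password', database='jeopardy')
mycursor = mydb.cursor()


def getText(cell):
    # Plain text of a category <td>, whitespace-stripped.
    return cell.get_text(strip=True)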
file_as_string = unicode(html_file.read(), errors='ignore')  # Python 2: tolerate undecodable bytes
html_file.close()
raw_a_tag_values = []
raw_a_tag_values.extend(
    re.findall(r"(\<a.*?\>)", file_as_string,
               re.IGNORECASE | re.MULTILINE | re.DOTALL))
if len(raw_a_tag_values) > 0:
    print("Operating on " + subdir_string + '/' + cur_tree_location + '/' + cur_tree_file + ":\n")
    for cur_raw_a_tag_value in raw_a_tag_values:
        cur_raw_a_tag_value_orig = cur_raw_a_tag_value
        cur_raw_a_tag_value = cur_raw_a_tag_value.replace("\r", " ").replace("\n", " ")
        if ('href' in cur_raw_a_tag_value
                and '<?' not in cur_raw_a_tag_value
                and ' $' not in cur_raw_a_tag_value
                and not cur_raw_a_tag_value.count('\\') > 3
                and 'file://' not in cur_raw_a_tag_value
                and '<area' not in cur_raw_a_tag_value):
            cur_a_href_value = BeautifulSoup(cur_raw_a_tag_value, 'html.parser').a['href']
            if cur_a_href_value.lower().endswith(media_bins_suffixes):
                if (not cur_a_href_value.startswith(('http', '//'))
                        or cur_a_href_value.startswith(on_eclipse_uri_prefixes)):
                    print("Replacing " + cur_raw_a_tag_value)
                    if options.auto_process is True:
                        new_filestring = media_server_url + guess_new_imagepath(
                            cur_a_href_value, media_server_url,
                            subdir_string + cur_tree_location)
                    else:
                        new_filestring = read_input_prefill(
                            'New img src (Enter nothing to skip) : ',
                            media_server_url + guess_new_imagepath(
                                cur_a_href_value, media_server_url,
                                subdir_string + cur_tree_location))
def parse_api(self, response):
    """
    Parses a *potential* API documentation page.
    """
    # Find all the function definitions on the page:
    for func in response.css('dl.function'):
        # Function details are always the first items in the dl.
        func_spec = func.css('dt')[0]
        func_doc = func.css('dd')[0]
        # Function name is always the first dt.
        fn1 = BeautifulSoup(
            func_spec.css('code.descclassname').extract()[0], 'html.parser').text
        fn2 = BeautifulSoup(
            func_spec.css('code.descname').extract()[0], 'html.parser').text
        func_name = fn1 + fn2
        # Args into the function.
        args = []
        for ems in func_spec.css('em'):
            args.append(ems.extract().replace('<em>', '').replace('</em>', ''))
        # Function description.
        soup = BeautifulSoup(func_doc.extract(), 'html.parser')
        d = self.to_dict(func_name, args, soup.text)
        if d:
            yield d

    # Find all the class definitions on the page:
    for classes in response.css('dl.class'):
        # Class details are always the first items in the dl.
        class_spec = classes.css('dt')[0]
        class_doc = classes.css('dd')[0]
        # Class name is always the first dt.
        cn1 = BeautifulSoup(
            class_spec.css('code.descclassname').extract()[0], 'html.parser').text
        cn2 = BeautifulSoup(
            class_spec.css('code.descname').extract()[0], 'html.parser').text
        class_name = cn1 + cn2
        # Args into __init__.
        init_args = []
        for ems in class_spec.css('em'):
            props = 'property' in ems.css('::attr(class)').extract()
            if not props:
                init_args.append(
                    ems.extract().replace('<em>', '').replace('</em>', ''))
        # Class description. Everything up to and including the field-list.
        soup = BeautifulSoup(class_doc.extract(), 'html.parser')
        contents = soup.contents[0].contents
        description = ''
        for child in contents:
            if child.name == 'p':
                description += child.text + '\n\n'
            if child.name == 'table':
                raw = child.text
                rows = [r.strip() for r in raw.split('\n') if r.strip()]
                description += '\n'
                description += '\n'.join(rows)
                break
            if child.name == 'dl':
                break
        d = self.to_dict(class_name, init_args, description)
        if d:
            yield d

        # Remaining dt are methods or attributes.
        for methods in classes.css('dl.method'):
            # Parse and yield methods.
            method_name = BeautifulSoup(
                methods.css('code.descname').extract()[0], 'html.parser').text
            if method_name.startswith('__'):
                break
            method_name = class_name + '.' + method_name
            method_args = []
            for ems in methods.css('em'):
                method_args.append(
                    ems.extract().replace('<em>', '').replace('</em>', ''))
            description = BeautifulSoup(
                methods.css('dd')[0].extract(), 'html.parser').text
            d = self.to_dict(method_name, method_args, description)
            if d:
                yield d

        for data in classes.css('dl.attribute'):
            name = BeautifulSoup(
                data.css('code.descname').extract()[0], 'html.parser').text
            name = class_name + '.' + name
            description = BeautifulSoup(
                data.css('dd')[0].extract(), 'html.parser').text
            d = self.to_dict(name, None, description)
            if d:
                yield d

        for data in classes.css('dl.data'):
            name = BeautifulSoup(
                data.css('code.descname').extract()[0], 'html.parser').text
            name = class_name + '.' + name
            description = BeautifulSoup(
                data.css('dd')[0].extract(), 'html.parser').text
            d = self.to_dict(name, None, description)
            if d:
                yield d
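Both spider callbacks funnel results through a self.to_dict helper that is not shown; a minimal sketch of what such a helper might do (the output schema is an assumption):

def to_dict(self, name, args, description):
    # Drop entries with no usable description so the spiders yield nothing for them.
    description = (description or '').strip()
    if not description:
        return None
    return {'name': name, 'args': args or [], 'description': description}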
def descargarCategoriaEspecifica22(URLLL, resultados):
    resultado = descargarResultado("/producto/" + URLLL, 360, 10)
    try:
        codigo = URLLL
    except Exception:
        codigo = ''
    try:
        nombre = resultado.split('<h2 class="with-tabs">')[1].split('</h2>')[0].replace("\\t", '').strip()
    except Exception:
        nombre = ''
    try:
        categoria = resultado.split('<b>Categor')[1].split('</div>')[0].split('</b>')[1].replace("\\t", '').replace("\\n", '').strip()
    except Exception:
        categoria = ''
    try:
        costo = resultado.split('class="uc-price">')[2].split('<')[0].replace("\\t", '').strip()
    except Exception:
        costo = ''
    try:
        fotos = 'http://www.radec.com.mx/sites/all/files/productos/' + codigo + '.jpg'
    except Exception:
        fotos = ''
    val = 0
    nombre2 = nombre
    try:
        for car in resultado.split("/sites/all/themes/radec/images/car_icon.gif"):
            marca = ''
            marca_auto = ''
            modelo = ''
            anio = ''
            notas = ''
            if val == 0:
                val = 1
            else:
                try:
                    marca_auto = car.split('<')[0].split('>')[1].replace("\\t", '').split('\\n')[2].strip()
                except Exception:
                    marca_auto = ''
                try:
                    marca = ''
                    if ' TYC ' in nombre:
                        marca = 'TYC'
                    if ' DEPO ' in nombre:
                        marca = 'DEPO'
                except Exception:
                    marca = ''
                try:
                    modelo = car.split('<')[0].split('>')[1].replace("\\t", '').split('\\n')[3].strip()
                except Exception:
                    modelo = ''
                anio = car.split('<')[0].split('>')[1].replace("\\t", '').split('\\n')[5].strip()
                if anio != 'ALL YEARS':
                    anioOrigin2 = '#' + anio
                    anioOrigin = anioOrigin2.replace('#20', '').replace('#19', '').replace('-20', '-').replace('-19', '-')
                    anioList = []
                    if '-' in anio:
                        # Expand a year range like "2010-2014" into single years.
                        anioInicio = int(anio.split('-')[0])
                        anioFin = int(anio.split('-')[1])
                        while anioInicio <= anioFin:
                            anioList.append(str(anioInicio))
                            anioInicio = anioInicio + 1
                        anio = ' '.join(anioList) + ' '
                        if len(anioList) < 5:
                            nombre = nombre.replace(anioOrigin, anio)
                        else:
                            nombre = nombre.replace(anioOrigin, anioOrigin2.replace('#', '').replace('-', ' a '))
                try:
                    notas = resultado.split('<b>Aplicaciones:</b>')[1].split('</div>')[0].replace("\\t", '').replace("\\n", '').replace('<br/>', ' - ')
                    notas = BeautifulSoup(notas, 'html.parser').text
                    # Collapse runs of spaces and trim dangling separators.
                    while '  ' in notas:
                        notas = notas.replace('  ', ' ')
                    if notas.startswith(' - '):
                        notas = notas.replace(" - ", "", 1)
                    if notas.endswith(' - '):
                        notas = rreplace(notas, " - ", "", 1)
                except Exception:
                    notas = ''
                # Expand make abbreviations in the product name.
                nombre = (nombre.replace(' FD ', ' FORD ').replace(' CV ', ' CHEVROLET ')
                          .replace(' TY ', ' TOYOTA ').replace(' AD ', ' AUDI ')
                          .replace(' BK ', ' BUICK ').replace(' MC ', ' MERCEDES BENZ ')
                          .replace(' ST ', ' SEAT ').replace(' VW ', ' VOLKSWAGEN ')
                          .replace(' KI ', ' KIA ').replace(' NS ', ' NISSAN ')
                          .replace(' HD ', ' HONDA ').replace(' SN ', ' SATURN ')
                          .replace(' JP ', ' JEEP ').replace(' AC ', ' ACURA ')
                          .replace(' DG ', ' DODGE ').replace(' PT ', ' PONTIAC ')
                          .replace(' BW ', ' BMW ').replace(' CR ', ' CHRYSLER ')
                          .replace(' MT ', ' MITSUBISHI ').replace(' PG ', ' PEUGEOT ')
                          .replace(' UNIV ', ' UNIVERSAL '))
                nombre = nombre.replace(' JGO ', ' JUEGO ').replace(' CD ', ' CADILLAC ')
                resultados.append('"' + codigo + '","' + nombre + '","' + marca + '","' + marca_auto + '","' + categoria + '","' + costo + '","' + modelo + '","' + fotos + '","' + anio + '","' + notas + '"')
    except Exception:
        print('FALLO ---- > ' + URLLL)
        resultados.append('"' + URLLL + '"')
    return
sjClues = [[0 for x in range(6)] for y in range(5)]
djClues = [[0 for x in range(6)] for y in range(5)]
sjAnswers = [[0 for x in range(6)] for y in range(5)]
djAnswers = [[0 for x in range(6)] for y in range(5)]
row = 0
col = 0
for square in sjDivs:
    text = square.find('td', class_='clue_text')
    if text:
        sjClues[row][col] = text.text
        answerDiv = square.find('div')
        if answerDiv:
            answer = extract.search(str(answerDiv))
            pretty = BeautifulSoup(answer.group(1), 'html.parser').text
            if pretty.startswith('<i>'):
                pretty = pretty[3:]
            if pretty.endswith('</i>'):
                pretty = pretty[:-4]
            sjAnswers[row][col] = pretty
    col += 1
    if col == 6:
        col = 0
        row += 1
row = 0
col = 0
for square in djDivs:
    text = square.find('td', class_='clue_text')
    if text:
        djClues[row][col] = text.text
def findvideos(item):
    logger.info()
    itemlist = list()
    sub = ""
    soup = create_soup(item.url)
    matches = soup.find("div", class_="navEP2")
    if not matches:
        return itemlist

    for elem in matches.find_all("li", class_="dooplay_player_option"):
        post = {
            "action": "doo_player_ajax",
            "post": elem["data-post"],
            "nume": elem["data-nume"],
            "type": elem["data-type"]
        }
        headers = {"Referer": item.url}
        doo_url = "%swp-admin/admin-ajax.php" % host
        data = httptools.downloadpage(doo_url, post=post, headers=headers).data
        if not data:
            continue
        player_url = BeautifulSoup(data, "html5lib").find("iframe")["src"]
        player_url = player_url.replace("https://animekao.club/video/",
                                        "https://kaocentro.net/video/")
        if (not player_url.startswith("https://re.")
                and not player_url.startswith("https://kaocentro.net/video/")):
            url = process_url(player_url)
            if not url:
                continue
            itemlist.append(
                Item(channel=item.channel, title='%s', action='play', url=url,
                     language="LAT", infoLabels=item.infoLabels, subtitle=sub))
        else:
            player = httptools.downloadpage(player_url,
                                            headers={"referer": item.url}).data
            soup = BeautifulSoup(player, "html5lib")
            if soup.find("div", id="ErrorWin"):
                continue
            matches = soup.find_all("li", {"onclick": True})
            lang_data = soup.find("li", class_="SLD_A")
            if lang_data.has_attr("data-lang"):
                lang = lang_data.get("data-lang", "2")
            else:
                lang = scrapertools.find_single_match(
                    lang_data.get("onclick", ""), "this, '([^']+)'")
            for elem in matches:
                if not elem.has_attr("data-r"):
                    url = scrapertools.find_single_match(
                        elem.get("onclick", ""), r"go_to_player\('([^']+)")
                else:
                    url = base64.b64decode(elem["data-r"]).decode('utf-8')
                if not url or "short." in url:
                    continue
                url = process_url(url)
                if not url:
                    continue
                itemlist.append(
                    Item(channel=item.channel, title='%s', action='play', url=url,
                         language=IDIOMAS.get(lang, "VOSE"),
                         infoLabels=item.infoLabels, subtitle=sub))

    itemlist = servertools.get_servers_itemlist(
        itemlist, lambda x: x.title % x.server.capitalize())

    # Required by FilterTools
    itemlist = filtertools.get_links(itemlist, item, list_language)

    # Required by AutoPlay
    autoplay.start(itemlist, item)

    if (config.get_videolibrary_support()
            and len(itemlist) > 0 and item.extra != 'findvideos'):
        itemlist.append(
            Item(channel=item.channel,
                 title='[COLOR yellow]Añadir esta pelicula a la videoteca[/COLOR]',
                 url=item.url, action="add_pelicula_to_library",
                 extra="findvideos", contentTitle=item.contentTitle))

    return itemlist
headers = {
    'method': 'GET',  # the 'method' key is an assumption; the snippet began mid-dict
    'scheme': 'https',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Connection': 'Keep-Alive',
    'cache-control': 'max-age=0',
    'Range': 'bytes=0-100000',
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/65.0.3325.181 Safari/537.36')
}
url_htm = s.get(url, headers=headers)
soup = BeautifulSoup(url_htm.text, "html.parser")
print("hello.")
for line in soup.findAll('div', {'class': 'e_col w4_5'}):
    soup2 = BeautifulSoup(str(line), "html.parser")
    for line2 in soup2.findAll('div', {'class': 'title '}):
        q_url = BeautifulSoup(str(line2), "html.parser").find('a').get('href')
        print(q_url)
        if q_url.startswith('/unanswered'):
            continue
        else:
            questionslinks.append('https://www.quora.com' + q_url)

with open('questionslinks.txt', 'w', encoding='utf-8') as f:
    for i in range(len(questionslinks)):
        f.write(questionslinks[i] + '\n')
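The crawl above assumes a requests session s, a starting url, and a questionslinks accumulator created earlier in the script; a sketch of that setup (the topic URL is a placeholder):

import requests
from bs4 import BeautifulSoup

s = requests.Session()
url = 'https://www.quora.com/topic/Example'  # placeholder topic page
questionslinks = []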
for line in infile:
    if '![' in line:
        # Point non-http image targets at the local img/ directory.
        line = re.sub(r'(?!\[.*)\]\((?!http)', '](img/', line)
    else:
        for src, target in replacements.items():
            line = line.replace(src, target)
    # if 'youtube.com/embed/' in line:
    #     youtubeString = re.findall(r'(www\.youtube\.com/embed/\S*)"', line)
    #     youtubeStringStringed = youtubeString[0]
    #     youtubeStringFixed = 'https//' + youtubeStringStringed
    #     youtubeStringFixed = re.sub(r'embed/', 'watch?v=', youtubeStringFixed)
    #     wholeLine = '{% youtube %}' + youtubeStringFixed + '{% endyoutube %}'
    #     line = '\n' + wholeLine + '\n'
    if line.startswith('Obra publicada con'):
        line = line.replace(line, '')
    if '![' in line:
        line = '\n' + line + '\n'
    outfile.write(line)

infile.close()
outfile.close()
os.remove(file)
os.rename(tmp_file, file_name)

# Ask until the user confirms the GitHub repo exists.
pasado_a_repo = 'No'
while pasado_a_repo != 'Sí':
    pasado_a_repo = input('¿Has creado el repo en github? ')