def test_margin_before():
    html = '<html><body><p>first</p></body></html>'
    assert get_text(html) == 'first'

    html = '<html><body>first<p>' \
           'second</p></body></html>'
    assert get_text(html) == 'first\nsecond'
def test_successive_a():
    html = '<html><body><a href="first">first</a>' \
           '<a href="second">second</a></body></html>'
    assert get_text(html) == 'firstsecond'

    html = '<html><body><a href="first">first</a>\n' \
           '<a href="second">second</a></body></html>'
    assert get_text(html) == 'first second'
def test_table_cell_separator():
    html = '<html><body><table><tr><td>Hallo<br>Eins</td><td>Echo<br>Zwei</td></tr></table></html>'

    config = ParserConfig()
    assert get_text(html, config) == 'Hallo Echo\nEins Zwei\n'

    config = ParserConfig(table_cell_separator='\t')
    assert get_text(html, config) == 'Hallo\tEcho\nEins \tZwei\n'
def test_forgotten_td_close_tag():
    # one line (i.e., missing </td> before the next <td> and the next </tr>)
    html = (u'<body>hallo<table>'
            '<tr><td>1<td>2</tr>'
            u'</table>echo</body>')
    assert get_text(html, config) == u'hallo\n1 2\necho'

    # two lines (i.e., missing </td> before the <tr> and before the </table>)
    html = (u'<body>hallo<table>'
            '<tr><td>1<td>2'
            '<tr><td>3<td>4'
            u'</table>echo</body>')
    assert get_text(html, config) == u'hallo\n1 2\n3 4\necho'
def test_divs():
    html = u'<body>Thomas<div>Anton</div>Maria</body>'
    assert get_text(html) == u'Thomas\nAnton\nMaria'

    html = u'<body>Thomas<div>Anna <b>läuft</b> weit weg.</div>'
    assert get_text(html) == u'Thomas\nAnna läuft weit weg.'

    html = u'<body>Thomas <ul><li><div>Anton</div>Maria</ul></body>'
    assert get_text(html) == u'Thomas\n * Anton\n Maria'

    html = u'<body>Thomas <ul><li> <div>Anton</div>Maria</ul></body>'
    assert get_text(html) == u'Thomas\n * Anton\n Maria'

    html = u'<body>Thomas <ul><li> a <div>Anton</div>Maria</ul></body>'
    assert get_text(html) == u'Thomas\n * a\n Anton\n Maria'
def search_in_page():
    soup = BeautifulSoup(URL, "lxml")
    Data_search_1 = soup.find("p", class_="short")
    Data_search_2 = soup.find("p", class_="long")
    get_text_1 = get_text(str(Data_search_1))
    get_text_2 = get_text(str(Data_search_2))

    word_align = word.center(int(columns))
    text_ir = "-------------- < Translate IR > ---------------"
    end_command = "-----------------------------------------------------------"
    end_command = end_command.center(int(columns))
    msg_x = text_ir.center(int(columns))

    print(colored(get_text_1, "green"), "\n\n",
          colored(get_text_2, "green"))
    print("\n\n", colored(msg_x, "red"), "\n\n")
    print(colored(word_align, "green"), "\n")
    print(colored(end_command, "red"))
def test_tail():
    """
    Ensure that tail elements are formatted based on the container element.
    """
    html = (u'<body>Hi<span style="white-space: pre"> 1 3 </span>'
            u' versus 1 3')
    assert get_text(html, config) == u'Hi 1 3 versus 1 3'
def preprocessFiles(rootInputPath, outputPath, forceRecreate=False, debugInfo=False):
    outputFilePath = f"{outputPath}/processed.json"
    if os.path.isfile(outputFilePath) and not forceRecreate:
        with open(outputFilePath, "r") as file:
            return json.load(file)

    print("Preprocessing...")
    processed = {}
    inputPaths = [
        f for f in glob.glob(rootInputPath + "**/*.html", recursive=True)
    ]
    for idx, documentPath in enumerate(inputPaths):
        if debugInfo:
            print(
                f"[{(idx / len(inputPaths)) * 100:.0f}%] Working on {documentPath}"
            )
        with open(documentPath, mode='r', encoding="utf-8") as file:
            text = re.sub(r'<[^<]+?>', '', get_text(file.read().lower()))
            processed[re.compile(r'.*/(.*).html').search(
                documentPath).group(1)] = {
                    "tokens": Preprocess.tokenize(text),
                    "content": Preprocess.contentize(text)
                }

    with open(outputFilePath, mode="w", encoding="utf-8") as file:
        json.dump(processed, file, ensure_ascii=False)
    return processed
def readDocs(self):
    """ Read all documents in the path and store them in the docs list. """
    DataPathList = glob.glob(self.pathDocs + '*.html')
    DataPathList.sort()
    self.docs = []
    h = 0
    for docPath in DataPathList:
        f = codecs.open(docPath, 'r')
        text = get_text(f.read())
        self.docs.append({
            'id': str(h),
            'text': self.removePonctuation(re.split(r'\s|\n', text))
        })
        print(re.split(r'\s|\n', text))
        # print(re.split(r'\s|, |\*|\n', text))
        h += 1
def fetch_result(term):
    url = get_url(term)
    html = urllib.request.urlopen(url).read().decode('utf-8')
    text = get_text(html)
    with open("result_of_search.txt", "w") as f:
        f.write(text)
    return url
def html2text(raw_html):
    title_search = re.search(EXTRACT_TITLE, raw_html)
    if title_search is not None:
        title = title_search.groups()[0]
    else:
        title = ''

    raw_html = raw_html.replace("<", " <").replace(">", "> ")
    text_inscr = get_text(raw_html)
    text_bs = bs4_text_from_html(raw_html)

    text = ''
    if len(text_inscr) == 0:
        text = text_bs
    else:
        text = text_inscr

    # if TEXT_EXTRACTOR_TYPE in ['inscriptis']:
    #     text = get_text(raw_html)
    # else:
    #     text = bs4_text_from_html(raw_html)

    return {'text': text, 'title': title}
def get_text_content(url):
    """ Return all text content from the url. """
    req = Request(url, headers={'User-Agent': 'Mozilla/75.0'})
    uvcontext = ssl._create_unverified_context()
    webpage = urlopen(req, context=uvcontext).read().decode('utf-8')
    return get_text(webpage)
def test_html_snippets(filter_str=''):
    for testcase_txt in glob(TESTCASE_PATTERN):
        if filter_str not in testcase_txt:
            continue

        with open(testcase_txt) as f:
            reference_txt = f.read().rstrip()

        with open(testcase_txt.replace('.txt', '.html')) as f:
            print(f.name)
            html = '<html><body>{}</body></html>'.format(f.read())

        converted_txt = get_text(
            html, ParserConfig(css=CSS_PROFILES['strict'])).rstrip()

        if converted_txt != reference_txt:
            print('File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}'.format(
                testcase_txt, html, reference_txt, converted_txt))
            print('HTML file:', testcase_txt.replace('.txt', '.html'))
            print("Visualize differences with `vimdiff reference.txt "
                  "converted.txt`")
            open("reference.txt", "w").write(reference_txt)
            open("converted.txt", "w").write(converted_txt)

        assert converted_txt == reference_txt
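# A minimal sketch of the CSS-profile switch exercised in the test above, assuming
# the usual inscriptis 2.x import paths. The built-in 'strict' and 'relaxed'
# profiles differ in how aggressively whitespace is collapsed; no particular
# output is asserted here.
from inscriptis import get_text
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.config import ParserConfig

snippet = '<html><body><div>Hello</div><span>   world </span></body></html>'
strict_text = get_text(snippet, ParserConfig(css=CSS_PROFILES['strict']))
relaxed_text = get_text(snippet, ParserConfig(css=CSS_PROFILES['relaxed']))
print(repr(strict_text), repr(relaxed_text))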
def main():
    url = 'http://www.ieee.org/conferences_events/conferences/search/index.html?KEYWORDS=&CONF_SRCH_RDO=conf_date&RANGE_FROM_DATE=&RANGE_TO_DATE=&REGION=Region10-Asia+and+Pacific&COUNTRY=Bangladesh&RowsPerPage=10&PageLinkNum=10&ActivePage=1&SORTORDER=desc&SORTFIELD=start_date'
    content = urlopen(url)
    soup = BeautifulSoup(content, 'lxml')
    conference_table = soup.findChildren('table', class_='nogrid-nopad')
    rows = conference_table[0].findChildren('td', class_='pad10')

    events = []
    for row in rows:
        event = row.find_all('p')
        for info in event:
            events.append(get_text(str(info)))

    label = [
        "Event title: ", "Date of Submissions:", "Event Date:",
        "Event Location:"
    ]
    extra_decoration = 0
    print("*" * 60, "\n")
    for lab, event in zip(label * len(events), events):
        print(lab, event, end="\n")
        extra_decoration += 1
        if extra_decoration == 4:
            print("\n", "*" * 60, "\n")
            extra_decoration = 0
def parse(self, response):
    page = response.url.split("/")[-2]
    filename = 'C:\\Users\\Prashant Mishra\\PycharmProjects\\VeQuest\\blogs\\Page-%s.txt' % page
    resp = response.css("body").extract()
    output = get_text(resp[0])
    with open(filename, 'wb') as f:
        f.write(output.encode())
def scraping(url):
    """
    Return a list of (word, count) tuples for the 10 most frequent words on the
    given web page that are not in the exceptions list.

    Argument:
    - url: URL of the web page to collect the words from
    """
    html = urllib.request.urlopen(url).read().decode('utf-8')  # open the HTTP request
    text = get_text(html)  # build the string from which the words are extracted
    cnt = Counter()  # initialise the Counter that will hold the words
    exceptions = [
        'le', 'la', 'les', 'un', 'une', 'des', 'du', 'de', 'au', 'aux', 'o',
        'qui', 'que', 'quoi', 'et', 'pour', '*', '+', 'vous', 'notre', 'nos',
        '?', 'en', ':', 'vos', 'votre', 'sur', 'à', 'avec', 'dans', 'nous',
        'Nous', 'leur', 'Vous', 'y', 'Comment', 'En', 'plus', 'Nos', 'ici',
        'the', 'a', 'to', '+'
    ]  # hand-picked list of words that carry little or no information
    words = text.split()  # list of the words in the original text
    cnt = Counter(words)  # build the Counter
    for i in exceptions:  # ignore the exceptions (their count is set to 0)
        for ic in cnt:
            if ic == i:
                cnt[ic] = 0
    return cnt.most_common(10)
def worker(thread_num, urls):
    url_text = []
    thread_file = open(str(thread_num) + ".dat", "a+")
    for (idx, url) in urls:
        print(thread_num, idx, url)
        try:
            html = urllib.urlopen(url).read().decode("utf8")
            text = get_text(html)
        except Exception as e:
            print("Error: " + str(e))
            url_text.append((idx, ""))
            continue

        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        # drop blank lines
        chunks = [chunk for chunk in chunks if chunk]
        chunks = filter(lambda line: len(line.split(" ")) >= 8, chunks)
        # remove lines containing html tags
        chunks = filter(lambda word: "<" not in word, chunks)
        # replace non-alphanumeric characters with spaces
        chunks = map(lambda word: non_alpha_regex.sub(" ", word), chunks)

        thread_file.write("|".join([str(idx), "\n".join(chunks)]))
        thread_file.write("##")
    thread_file.close()
def get_content(self, html_name, mode):
    f = open(html_name)
    url = f.readline()[:-1]
    if os.fstat(f.fileno()).st_size > self.max_size:
        print('B')
        f.close()
        return url, [[], []]

    html_rest = f.read()
    if mode == 'BS':
        soup = BeautifulSoup(html_rest, "html.parser")
        [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
        html_text = soup.get_text()
        headers = self.get_headers(html_rest)
        nolem = self.clean(html_text)
        lem = self.lemmatize(nolem)
        val = [lem, headers]
    else:
        extractor = Extractor(extractor='DefaultExtractor', html=html_rest)
        html_text = extractor.getText()
        nolem = self.clean(html_text)
        lem = self.lemmatize(nolem)
        if len(lem) < self.min_lems:
            html_text = get_text(html_rest)
            html_text = self.get_outer_fields(html_text)
            nolem = self.clean(html_text)
            lem = self.lemmatize(nolem)
        val = [lem]

    f.close()
    return url, val
def run_inscriptis(htmlstring):
    '''try with the inscriptis module'''
    try:
        text = get_text(htmlstring)
    except TypeError:
        text = ''
    return text  # sanitize(text)
def get_web_text(url):
    print("getting web text for %s" % url)
    try:
        hdr = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'
        }
        req = urllib.request.Request(url)
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        )
        html = urllib.request.urlopen(req).read().decode('utf-8')
        result = get_text(html)
        result = re.sub('\n', ' ', result)
        result = re.sub(' +', ' ', result)
        return result
    except Exception as e:
        print(e)
        return None
def parse(self, response):
    page = response.url.split("/")[-2]
    filename = Constants.base_file + '\\blogs\\Page-%s.txt' % page
    resp = response.css("body").extract()
    output = get_text(resp[0])
    with open(filename, 'wb') as f:
        f.write(output.encode())
def parse(self, response):
    # """
    # The lines below are a spider contract. For more info see:
    # http://doc.scrapy.org/en/latest/topics/contracts.html
    # @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    # @scrapes name
    # """
    # sel = Selector(response)
    # sites = sel.xpath('//ul[@class="directory-url"]/li')
    # items = []
    item = Website()
    markup = response.xpath('/html').extract()
    regex = re.compile(r'[\n\r\t]')
    content = get_text(regex.sub(" ", markup[0]))

    item["url"] = response.request.url
    item["snapshot"] = {}
    item["snapshot"]["response_url"] = response.url
    item["snapshot"]["status"] = response.status
    item["snapshot"]["title"] = response.xpath('/html/head/title/text()').extract_first()
    item["snapshot"]["content"] = content
    item["snapshot"]["timestamp"] = datetime.datetime.now().timestamp()
    return item
def full_html_content(url, user_agent):
    try:
        html = create_requests(url, user_agent)
        text = get_text(html)
        return text
    except Exception as ex:
        print(ex)
        return ""
def scrap_it(url_name):
    try:
        url = url_name
        html = urllib.request.urlopen(url).read().decode('utf-8')
        return ' '.join(get_text(html).split())
    except ConnectionError:
        raise Exception('Check your network connection!')
    except:
        raise Exception('Invalid url!')
def _parse_html(doc_body):
    title = re.search(TITLE_MATCH, doc_body)
    if title:
        title = title.group(1)
    else:
        title = ''
    doc_body = doc_body.replace('<', ' <').replace('>', '> ')
    return '{}{}{}'.format(title, TITLE_SEP, get_text(doc_body))
def Points():
    link = requests.get("http://mydiba.link")
    url = link.text
    soup = BeautifulSoup(url, "html.parser")
    link_movie_pints = soup.find_all('div', class_="post-infos-index")
    for point in link_movie_pints:
        point = str(point)
        urls_point = get_text(point)
        sleep(1)
        print("About the movie\n\n", "IMdb", urls_point, "\n\n")
def sabamovie_get_title(PageName, Domain):
    url = Domain
    Url_Request = get(url).text
    Soup = BeautifulSoup(Url_Request, "lxml")
    Soup_Find = Soup.find_all("h2", class_="title-text")
    Sting_soup = get_text(str(Soup_Find))
    Sting_soup = Sting_soup.replace("[", "")
    Sting_soup = Sting_soup.replace("]", "")
    Sting_soup = Sting_soup.replace(",", "")
    bot.send_message(message.chat.id, Sting_soup)
def test_display_links():
    html = '''<html>
                <body>
                  <a href="first">first</a>
                  <a href="second">second</a>
                </body>
              </html>
           '''
    assert get_text(html, display_links=True).strip() == \
        '[first](first) [second](second)'
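# Note: the keyword-argument form used in the test above reflects an older
# inscriptis API. A rough equivalent for newer releases, where rendering options
# moved into ParserConfig, is sketched below (an assumption about the installed
# version, not a drop-in replacement for the test).
from inscriptis import get_text
from inscriptis.model.config import ParserConfig

html = '<html><body><a href="first">first</a> <a href="second">second</a></body></html>'
print(get_text(html, ParserConfig(display_links=True)).strip())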
def Browser_artist():
    """ Search for an artist by name. """
    browser_artist_char = args.ser
    url_radio_javan = "https://www.radiojavan.com/mp3s/browse/artists/" + browser_artist_char
    url_Browsers = requests.get(url_radio_javan).text
    soup = BeautifulSoup(url_Browsers, "lxml")
    data_Browser = soup.find_all("span", class_="artist")
    for item_browser in data_Browser:
        get_txt = get_text(str(item_browser))
        print(get_txt)
def book_read(book_name):
    book = open_book(str(book_name))
    lines = convert_epub_to_lines(book)
    print(len(lines))
    s = "\n".join(lines)
    # text = re.sub("<[^>]*>", "", s)
    text = get_text(s)
    text = re.sub("[ ]+", " ", text)
    text = re.sub("[\n]+", "\n", text)
    return text
def test_html_snippets(filter_str=''):
    for testcase_txt in glob(TESTCASE_PATTERN):
        if filter_str not in testcase_txt:
            continue

        with open(testcase_txt) as f:
            reference_txt = f.read().strip()

        with open(testcase_txt.replace(".txt", ".html")) as f:
            html = u"<html><body>{}</body></html>".format(f.read())

        converted_txt = get_text(html).strip()
        if converted_txt != reference_txt:
            print(u"File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}".format(
                testcase_txt, html, reference_txt, converted_txt))
        assert converted_txt == reference_txt
def test_successive_a():
    html = u'<html><body><a href="first">first</a><a href="second">second</a></body></html>'
    assert get_text(html) == 'firstsecond'

    html = u'<html><body><a href="first">first</a>\n<a href="second">second</a></body></html>'
    assert get_text(html) == 'first second'
""" Parses the arguments if script is run directly via console """ parser = argparse.ArgumentParser(description='Converts HTML from file or url to a clean text version') parser.add_argument('input', help='Html input either from a file or an url') parser.add_argument('-o', '--output', type=str, help='Output file (default:stdout).') parser.add_argument('-e', '--encoding', type=str, help='Content encoding for files (default:utf-8)', default='utf-8') parser.add_argument('-i', '--image-captions', action='store_true', default=False, help='Display image captions (default:false).') parser.add_argument('-d', '--deduplicate-image-captions', action='store_true', default=False, help='Deduplicate image captions (default:false).') args = parser.parse_args() return args if __name__ == "__main__": args = get_args() if args.input.startswith("http://") or args.input.startswith("https://"): html_content = urlopen(args.input) else: with open(args.input, encoding=args.encoding) as f: html_content = f.read() text = get_text(html_content, display_images=args.image_captions, deduplicate_captions=args.deduplicate_image_captions) if args.output: with open(args.output, 'w') as open_file: open_file.write(text) else: print(text.encode("utf-8"))
def pipeline():
    run_lynx = True
    run_justext = True
    run_html2text = True
    run_beautifulsoup = True
    run_inscriptis = True

    # Read the predefined list of URLs the script will benchmark against.
    sources = []
    with open(os.path.join(benchmarking_root, 'url_list.txt')) as url_list:
        for line in url_list:
            sources.append(line.strip())

    if not os.path.exists(benchmarking_results_dir):
        os.makedirs(benchmarking_results_dir)
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    with open(os.path.join(benchmarking_results_dir, 'speed_comparisons.txt'), 'w') as output_file:
        output_file.write(u"")

    for source in sources:
        source_name = clean_source_name(source)
        source_cache_path = os.path.join(cache_dir, source_name)
        if os.path.exists(source_cache_path):
            html = open(source_cache_path).read()
        else:
            try:
                html = urllib.request.urlopen(source).read().decode("utf-8")
            except UnicodeDecodeError:
                html = urllib.request.urlopen(source).read().decode("latin1")
            open(source_cache_path, 'w').write(html)

        with open(os.path.join(benchmarking_results_dir, 'speed_comparisons.txt'), 'a') as output_file:
            output_file.write(u"\nURL: {}\n".format(source_name))
        print("\nURL: {}".format(source_name))

        times = {}

        if run_lynx and lynx_available:
            algorithm = "lynx"
            start_time = time.time()
            for n in range(TRIES):
                data = get_output_lynx(html)
            stop_time = time.time()
            times[algorithm] = stop_time - start_time
            save_to_file(algorithm, source_name, data)

        if run_justext and justext_available:
            algorithm = "justext"
            start_time = time.time()
            for n in range(TRIES):
                data = get_output_justext(html)
            stop_time = time.time()
            times[algorithm] = stop_time - start_time
            save_to_file(algorithm, source_name, data)

        if run_html2text and html2text_available:
            algorithm = "html2text"
            start_time = time.time()
            for n in range(TRIES):
                data = get_output_html2text(html)
            stop_time = time.time()
            times[algorithm] = stop_time - start_time
            save_to_file(algorithm, source_name, data)

        if run_beautifulsoup:
            algorithm = "beautifulsoup"
            start_time = time.time()
            for n in range(TRIES):
                data = get_output_beautifulsoup(html)
            stop_time = time.time()
            times[algorithm] = stop_time - start_time
            save_to_file(algorithm, source_name, data)

        if run_inscriptis:
            algorithm = "inscriptis"
            start_time = time.time()
            for n in range(TRIES):
                data = inscriptis.get_text(html)
            stop_time = time.time()
            times[algorithm] = stop_time - start_time
            save_to_file(algorithm, source_name, data)

        speed_table = get_speed_table(times)
        print(speed_table)

        with open(os.path.join(benchmarking_results_dir, 'speed_comparisons.txt'), 'a') as output_file:
            output_file.write(speed_table + u"\n")
        with open(os.path.join(benchmarking_results_dir, 'speed_comparisons.txt'), 'a') as output_file:
            output_file.write(u"\n")