Example #1
def test_margin_before():
    html = '<html><body><p>first</p></body></html>'
    assert get_text(html) == 'first'

    html = '<html><body>first<p>' \
           'second</p></body></html>'
    assert get_text(html) == 'first\nsecond'
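The examples below call get_text without showing its import; a minimal sketch of the assumed setup (the module path matches recent inscriptis releases, so treat it as an assumption rather than part of the original snippets):

from inscriptis import get_text

# mirrors the first assertion above
print(get_text('<html><body><p>first</p></body></html>'))  # -> 'first'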
Example #2
def test_successive_a():
    html = '<html><body><a href="first">first</a>' \
           '<a href="second">second</a></body></html>'
    assert get_text(html) == 'firstsecond'

    html = '<html><body><a href="first">first</a>\n' \
           '<a href="second">second</a></body></html>'
    assert get_text(html) == 'first second'
Example #3
def test_table_cell_separator():
    html = '<html><body><table><tr><td>Hallo<br>Eins</td><td>Echo<br>Zwei</td></tr></table></html>'

    config = ParserConfig()
    assert get_text(html, config) == 'Hallo  Echo\nEins   Zwei\n'

    config = ParserConfig(table_cell_separator='\t')
    assert get_text(html, config) == 'Hallo\tEcho\nEins \tZwei\n'
Example #4
def test_forgotten_td_close_tag():
    # one line (i.e., missing </td> before the next <td> and the next </tr>)
    html = (u'<body>hallo<table>' '<tr><td>1<td>2</tr>' u'</table>echo</body>')
    assert get_text(html, config) == u'hallo\n1  2\necho'

    # two lines (i.e., missing </td> before the <tr> and before the </table>)
    html = (u'<body>hallo<table>'
            '<tr><td>1<td>2'
            '<tr><td>3<td>4'
            u'</table>echo</body>')
    assert get_text(html, config) == u'hallo\n1  2\n3  4\necho'
Example #5
def test_divs():
    html = u'<body>Thomas<div>Anton</div>Maria</body>'
    assert get_text(html) == u'Thomas\nAnton\nMaria'

    html = u'<body>Thomas<div>Anna <b>läuft</b> weit weg.</div>'
    assert get_text(html) == u'Thomas\nAnna läuft weit weg.'

    html = u'<body>Thomas <ul><li><div>Anton</div>Maria</ul></body>'
    assert get_text(html) == u'Thomas\n  * Anton\n    Maria'

    html = u'<body>Thomas <ul><li>  <div>Anton</div>Maria</ul></body>'
    assert get_text(html) == u'Thomas\n  * Anton\n    Maria'

    html = u'<body>Thomas <ul><li> a  <div>Anton</div>Maria</ul></body>'
    assert get_text(html) == u'Thomas\n  * a\n    Anton\n    Maria'
Example #7
def search_in_page():
    soup = BeautifulSoup(URL, "lxml")
    Data_search_1 = soup.find("p", class_="short")
    Data_search_2 = soup.find("p", class_="long")
    get_text_1 = get_text(str(Data_search_1))
    get_text_2 = get_text(str(Data_search_2))
    word_align = word.center(int(columns))
    text_ir = "-------------- < Translate IR > ---------------"
    end_command = "-----------------------------------------------------------"
    end_command = end_command.center(int(columns))
    msg_x = text_ir.center(int(columns))
    print(colored(get_text_1, "green"), "\n\n", colored(get_text_2, "green"))
    print("\n\n", colored(msg_x, "red"), "\n\n")
    print(colored(word_align, "green"), "\n")
    print(colored(end_command, "red"))
Example #8
def test_tail():
    """
    Ensure that the tail elements are formatted based on the container element.
    """
    html = (u'<body>Hi<span style="white-space: pre"> 1   3 </span>'
            u' versus 1   3')
    assert get_text(html, config) == u'Hi 1   3  versus 1 3'
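The config object referenced in this test (and in Example #4) is not defined in the snippet itself; a plausible construction, mirroring the ParserConfig used in Example #14 and treating the choice of CSS profile as an assumption:

from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.config import ParserConfig

# profile also used by the reference test cases in Example #14
config = ParserConfig(css=CSS_PROFILES['strict'])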
Example #9
def preprocessFiles(rootInputPath,
                    outputPath,
                    forceRecreate=False,
                    debugInfo=False):
    outputFilePath = f"{outputPath}/processed.json"
    if os.path.isfile(outputFilePath) and not forceRecreate:
        with open(outputFilePath, "r") as file:
            return json.load(file)
    print("Preprocessing...")
    processed = {}
    inputPaths = [
        f for f in glob.glob(rootInputPath + "**/*.html", recursive=True)
    ]
    for idx, documentPath in enumerate(inputPaths):
        if debugInfo:
            print(
                f"[{(idx / len(inputPaths)) * 100:.0f}%] Working on {documentPath}"
            )
        with open(documentPath, mode='r', encoding="utf-8") as file:
            text = re.sub(r'<[^<]+?>', '', get_text(file.read().lower()))
            processed[re.compile(r'.*/(.*).html').search(
                documentPath).group(1)] = {
                    "tokens": Preprocess.tokenize(text),
                    "content": Preprocess.contentize(text)
                }
    with open(outputFilePath, mode="w", encoding="utf-8") as file:
        json.dump(processed, file, ensure_ascii=False)
    return processed
Example #10
    def readDocs(self):
        """
        Read all documents of the path and set them in the Docs list
        """
        DataPathList = glob.glob(self.pathDocs + '*.html')
        DataPathList.sort()

        self.docs = []

        h = 0
        for docPath in DataPathList:
            f = codecs.open(docPath, 'r')
            text = get_text(f.read())

            self.docs.append({
                'id': str(h),
                'text': self.removePonctuation(re.split(r'\s|\n', text))
            })

            print(re.split(r'\s|\n', text))
            # print(re.split('\s|, |\*|\n',text))

            h += 1
Example #11
def fetch_result(term):
    url = get_url(term)
    html = urllib.request.urlopen(url).read().decode('utf-8')
    text = get_text(html)
    with open("result_of_search.txt", "w") as f:
        f.write(text)
    return url
Example #12
def html2text(raw_html):
    title_search = re.search(EXTRACT_TITLE, raw_html)
    if title_search is not None:
        title = title_search.groups()[0]
    else:
        title = ''

    raw_html = raw_html.replace("<", " <").replace(">", "> ")

    text_inscr = get_text(raw_html)
    text_bs = bs4_text_from_html(raw_html)

    text = ''
    if len(text_inscr) == 0:
        text = text_bs
    else:
        text = text_inscr

    #
    # if TEXT_EXTRACTOR_TYPE in ['inscriptis']:
    #     text = get_text(raw_html)
    # else:
    #     text = bs4_text_from_html(raw_html)

    return {'text': text, 'title': title}
Example #13
def get_text_content(url):
    """ return all text content from url """

    req = Request(url, headers={'User-Agent': 'Mozilla/75.0'})
    uvcontext = ssl._create_unverified_context()
    webpage = urlopen(req, context=uvcontext).read().decode('utf-8')
    return get_text(webpage)
Example #14
def test_html_snippets(filter_str=''):
    for testcase_txt in glob(TESTCASE_PATTERN):
        if filter_str not in testcase_txt:
            continue

        with open(testcase_txt) as f:
            reference_txt = f.read().rstrip()

        with open(testcase_txt.replace('.txt', '.html')) as f:
            print(f.name)
            html = '<html><body>{}</body></html>'.format(f.read())

        converted_txt = get_text(
            html, ParserConfig(css=CSS_PROFILES['strict'])).rstrip()

        if converted_txt != reference_txt:
            print('File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}'.
                  format(testcase_txt, html, reference_txt, converted_txt))
            print('HTML file:', testcase_txt.replace('.txt', '.html'))
            print("Visualize differences with `vimdiff reference.txt "
                  "converted.txt`")
            open("reference.txt", "w").write(reference_txt)
            open("converted.txt", "w").write(converted_txt)

        assert converted_txt == reference_txt
Example #15
def main():
    url = 'http://www.ieee.org/conferences_events/conferences/search/index.html?KEYWORDS=&CONF_SRCH_RDO=conf_date&RANGE_FROM_DATE=&RANGE_TO_DATE=&REGION=Region10-Asia+and+Pacific&COUNTRY=Bangladesh&RowsPerPage=10&PageLinkNum=10&ActivePage=1&SORTORDER=desc&SORTFIELD=start_date'
    content = urlopen(url)
    soup = BeautifulSoup(content, 'lxml')
    conference_table = soup.findChildren('table', class_='nogrid-nopad')
    rows = conference_table[0].findChildren('td', class_='pad10')

    events = []

    for row in rows:
        event = row.find_all('p')
        for info in event:
            events.append(get_text(str(info)))

    label = [
        "Event title: ", "Date of Submissions:", "Event Date:",
        "Event Location:"
    ]

    extra_decoration = 0

    print("*" * 60, "\n")

    for lab, event in zip(label * len(events), events):
        print(lab, event, end="\n")
        extra_decoration += 1

        if extra_decoration == 4:
            print("\n", "*" * 60, "\n")
            extra_decoration = 0
Example #16
    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'C:\\Users\\Prashant Mishra\\PycharmProjects\\VeQuest\\blogs\\Page-%s.txt' % page
        resp = response.css("body").extract()
        output = get_text(resp[0])
        with open(filename, 'wb') as f:
            f.write(output.encode())
Example #17
def scraping(url):
    """
    Return a list of (word, count) tuples for the 10 most frequent words on the
    given web page that are not in the exception list.
    argument:
        - url: URL of the web page from which to collect the words
    """
    html = urllib.request.urlopen(url).read().decode('utf-8')  # open the HTTP request
    text = get_text(html)  # build the string from which the words are extracted
    cnt = Counter()  # initialise the Counter that will hold the words
    exceptions = [
        'le', 'la', 'les', 'un', 'une', 'des', 'du', 'de', 'au', 'aux', 'o',
        'qui', 'que', 'quoi', 'et', 'pour', '*', '+', 'vous', 'notre', 'nos',
        '?', 'en', ':', 'vos', 'votre', 'sur', 'à', 'avec', 'dans', 'nous',
        'Nous', 'leur', 'Vous', 'y', 'Comment', 'En', 'plus', 'Nos', 'ici',
        'the', 'a', 'to', '+'
    ]
    # Custom list used to ignore words that carry little or no information
    words = text.split()  # build a list containing the words of the original text
    cnt = Counter(words)  # build the Counter
    for i in exceptions:  # ignore the exceptions (their count is set to 0)
        for ic in cnt:
            if ic == i:
                cnt[ic] = 0
    return cnt.most_common(10)
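A hypothetical call to the function above, for illustration only (the URL is a placeholder and the urllib.request, Counter and inscriptis imports used by scraping are assumed to be in place):

print(scraping('https://example.org'))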
Example #18
def worker(thread_num, urls):
    url_text = []
    thread_file = open(str(thread_num) + ".dat", "a+")
    for (idx, url) in urls:
        print(thread_num, idx, url)
        try:
            html = urllib.urlopen(url).read().decode("utf8")
            text = get_text(html)
        except Exception as e:
            print("Error: " + str(e))
            url_text.append((idx, ""))
            continue

        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines
                  for phrase in line.split("  "))
        # drop blank lines
        chunks = [chunk for chunk in chunks if chunk]
        chunks = filter(lambda line: len(line.split(" ")) >= 8, chunks)
        #Remove lines with html tags
        chunks = filter(lambda word: "<" not in word, chunks)
        #Remove alphanumeric characters and then remove lines with html tags
        chunks = map(lambda word: non_alpha_regex.sub(" ", word), chunks)
        thread_file.write("|".join([str(idx), "\n".join(chunks)]))
        thread_file.write("##")
    thread_file.close()
Example #19
    def get_content(self, html_name, mode):
        f = open(html_name)
        url = f.readline()[:-1]
        if (os.fstat(f.fileno()).st_size > self.max_size):
            print('B')
            f.close()
            return url, [[], []]

        html_rest = f.read()

        if mode == 'BS':
            soup = BeautifulSoup(html_rest, "html.parser")
            [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
            html_text = soup.get_text()
            headers = self.get_headers(html_rest)
            nolem = self.clean(html_text)
            lem = self.lemmatize(nolem)
            val = [lem, headers]

        else:
            extractor = Extractor(extractor='DefaultExtractor', html=html_rest)
            html_text = extractor.getText()
            nolem = self.clean(html_text)
            lem = self.lemmatize(nolem)
            if len(lem) < self.min_lems:
                html_text = get_text(html_rest)
                html_text = self.get_outer_fields(html_text)
                nolem = self.clean(html_text)
                lem = self.lemmatize(nolem)
            val = [lem]

        f.close()
        return url, val
Example #20
def run_inscriptis(htmlstring):
    '''try with the inscriptis module'''
    try:
        text = get_text(htmlstring)
    except TypeError:
        text = ''
    return text  # sanitize(text)
Example #21
def get_web_text(url):

    print("getting web text for %s" % url)

    try:

        hdr = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'
        }

        req = urllib.request.Request(url)
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        )

        html = urllib.request.urlopen(req).read().decode('utf-8')
        result = get_text(html)

        result = re.sub('\n', ' ', result)
        result = re.sub(' +', ' ', result)
        return result

    except Exception as e:
        print(e)
        return None
Example #22
    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = Constants.base_file + '\\blogs\\Page-%s.txt' % page
        resp = response.css("body").extract()
        output = get_text(resp[0])
        with open(filename, 'wb') as f:
            f.write(output.encode())
Example #23
    def parse(self, response):
        # """
        # The lines below is a spider contract. For more info see:
        # http://doc.scrapy.org/en/latest/topics/contracts.html

        # @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        # @scrapes name
        # """
        # sel = Selector(response)
        # sites = sel.xpath('//ul[@class="directory-url"]/li')
        # items = []
        item = Website()
        markup = response.xpath('/html').extract()
        regex = re.compile(r'[\n\r\t]')
        content = get_text(regex.sub(" ", markup[0]))
        
        item["url"] = response.request.url
        item["snapshot"] = {}

        item["snapshot"]["response_url"] = response.url
        item["snapshot"]["status"] = response.status
        item["snapshot"]["title"] = response.xpath('/html/head/title/text()').extract_first()
        item["snapshot"]["content"] = content
        item["snapshot"]["timestamp"] = datetime.datetime.now().timestamp()
        
        return item
Example #24
def full_html_content(url, user_agent):
    try:
        html = create_requests(url, user_agent)
        text = get_text(html)
        return text
    except Exception as ex:
        print(ex)
        return ""
Example #25
def scrap_it(url_name):
    try:
        url = url_name
        html = urllib.request.urlopen(url).read().decode('utf-8')
        return ' '.join(get_text(html).split())
    except ConnectionError:
        raise Exception('check your network connection!!')
    except:
        raise Exception('invalid url !')
Example #26
    def _parse_html(doc_body):
        title = re.search(TITLE_MATCH, doc_body)
        if title:
            title = title.group(1)
        else:
            title = ''

        doc_body = doc_body.replace('<', ' <').replace('>', '> ')
        return '{}{}{}'.format(title, TITLE_SEP, get_text(doc_body))
Example #27
def Points():
    link = requests.get("http://mydiba.link")
    url = link.text
    soup = BeautifulSoup(url, "html.parser")
    link_movie_pints = soup.find_all('div', class_="post-infos-index")
    for point in link_movie_pints:
        point = str(point)
        urls_point = get_text(point)
        sleep(1)
        print("About the movie\n\n", "IMdb", urls_point, "\n\n")
Example #28
def sabamovie_get_title(PageName, Domain):
    url = Domain
    Url_Request = get(url).text
    Soup = BeautifulSoup(Url_Request, "lxml")
    Soup_Find = Soup.find_all("h2", class_="title-text")
    Sting_soup = get_text(str(Soup_Find))
    Sting_soup = Sting_soup.replace("[", "")
    Sting_soup = Sting_soup.replace("]", "")
    Sting_soup = Sting_soup.replace(",", "")
    bot.send_message(message.chat.id, Sting_soup)
Example #29
def test_display_links():
    html = '''<html>
                 <body>
                   <a href="first">first</a>
                   <a href="second">second</a>
                 </body>
                </html>
            '''
    assert get_text(html, display_links=True).strip() == \
        '[first](first) [second](second)'
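In more recent inscriptis releases the display_links switch lives on ParserConfig rather than being a keyword argument of get_text; a hedged sketch of the equivalent call (module path assumed from current inscriptis releases):

from inscriptis import get_text
from inscriptis.model.config import ParserConfig

# anchors are rendered as [text](href), matching the assertion above
text = get_text(html, ParserConfig(display_links=True))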
Example #30
def Browser_artist():
    """    serach name artist  """
    browser_artist_char = args.ser
    url_radio_javan = "https://www.radiojavan.com/mp3s/browse/artists/"+browser_artist_char
    url_Browsers = requests.get(url_radio_javan).text
    soup = BeautifulSoup(url_Browsers, "lxml")
    data_Browser = soup.find_all("span", class_="artist")
    for item_browser in data_Browser:
        get_txt = get_text(str(item_browser))
        print(get_txt)
Example #31
def book_read(book_name):
    book = open_book(str(book_name))
    lines = convert_epub_to_lines(book)
    print(len(lines))
    s = ("\n".join(lines))
    #text=re.sub("<[^>]*>","",s
    text = get_text(s)
    text = re.sub("[ ]+", " ", text)
    text = re.sub("[\n]+", "\n", text)
    return text
Example #32
def test_html_snippets(filter_str=''):
    for testcase_txt in glob(TESTCASE_PATTERN):
        if filter_str not in testcase_txt:
            continue

        with open(testcase_txt) as f:
            reference_txt = f.read().strip()

        with open(testcase_txt.replace(".txt", ".html")) as f:
            html = u"<html><body>{}</body></html>".format(f.read())

        converted_txt = get_text(html).strip()

        if converted_txt != reference_txt:
            print (u"File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}".format(testcase_txt, html, reference_txt, converted_txt))

        assert converted_txt == reference_txt
Example #33
def test_successive_a():
    html = u'<html><body><a href="first">first</a><a href="second">second</a></body></html>'
    assert get_text(html) == 'firstsecond'

    html = u'<html><body><a href="first">first</a>\n<a href="second">second</a></body></html>'
    assert get_text(html) == 'first second'
Example #34
    """ Parses the arguments if script is run directly via console """
    parser = argparse.ArgumentParser(description='Converts HTML from file or url to a clean text version')
    parser.add_argument('input', help='Html input either from a file or an url')
    parser.add_argument('-o', '--output', type=str, help='Output file (default:stdout).')
    parser.add_argument('-e', '--encoding', type=str, help='Content encoding for files (default:utf-8)', default='utf-8')
    parser.add_argument('-i', '--image-captions', action='store_true', default=False, help='Display image captions (default:false).')
    parser.add_argument('-d', '--deduplicate-image-captions', action='store_true', default=False, help='Deduplicate image captions (default:false).')
    args = parser.parse_args()

    return args


if __name__ == "__main__":
    args = get_args()

    if args.input.startswith("http://") or args.input.startswith("https://"):
        html_content = urlopen(args.input)
    else:
        with open(args.input, encoding=args.encoding) as f:
            html_content = f.read()

    text = get_text(html_content,
                    display_images=args.image_captions,
                    deduplicate_captions=args.deduplicate_image_captions)
    if args.output:
        with open(args.output, 'w') as open_file:
            open_file.write(text)
    else:
        print(text.encode("utf-8"))
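A hypothetical invocation of this script, assuming it were saved as html2text.py (the original file name is not given); the flags correspond to the argparse options defined above:

python html2text.py https://example.org -o output.txt --image-captions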

Example #35
def pipeline():
    run_lynx = True
    run_justext = True
    run_html2text = True
    run_beautifulsoup = True
    run_inscriptis = True

    # These are a few predefined urls the script will process
    sources = []
    with open(os.path.join(benchmarking_root, 'url_list.txt')) as url_list:
        for line in url_list:
            sources.append(line.strip())

    if not os.path.exists(benchmarking_results_dir):
        os.makedirs(benchmarking_results_dir)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    with open(os.path.join(benchmarking_results_dir, 'speed_comparisons.txt'), 'w') as output_file:
            output_file.write(u"")

    for source in sources:

        source_name = clean_source_name(source)
        source_cache_path = os.path.join(cache_dir, source_name)
        if os.path.exists(source_cache_path):
            html = open(source_cache_path).read()
        else:
            try:
                html = urllib.request.urlopen(source).read().decode("utf-8")
            except UnicodeDecodeError:
                html = urllib.request.urlopen(source).read().decode("latin1")
            open(source_cache_path, 'w').write(html)

        with open(os.path.join(benchmarking_results_dir, 'speed_comparisons.txt'), 'a') as output_file:
            output_file.write(u"\nURL: {}\n".format(source_name))
        print("\nURL: {}".format(source_name))

        times = {}

        if run_lynx and lynx_available:
            algorithm = "lynx"
            start_time = time.time()
            for n in range(TRIES):
                data = get_output_lynx(html)
            stop_time = time.time()
            times[algorithm] = stop_time - start_time
            save_to_file(algorithm, source_name, data)

        if run_justext and justext_available:
            algorithm = "justext"
            start_time = time.time()
            for n in range(TRIES):
                data = get_output_justext(html)
            stop_time = time.time()
            times[algorithm] = stop_time - start_time
            save_to_file(algorithm, source_name, data)

        if run_html2text and html2text_available:
            algorithm = "html2text"
            start_time = time.time()
            for n in range(TRIES):
                data = get_output_html2text(html)
            stop_time = time.time()
            times[algorithm] = stop_time - start_time
            save_to_file(algorithm, source_name, data)

        if run_beautifulsoup:
            algorithm = "beautifulsoup"
            start_time = time.time()
            for n in range(TRIES):
                data = get_output_beautifulsoup(html)
            stop_time = time.time()
            times[algorithm] = stop_time - start_time
            save_to_file(algorithm, source_name, data)

        if run_inscriptis:
            algorithm = "inscriptis"
            start_time = time.time()
            for n in range(TRIES):
                data = inscriptis.get_text(html)
            stop_time = time.time()
            times[algorithm] = stop_time - start_time
            save_to_file(algorithm, source_name, data)

        speed_table = get_speed_table(times)
        print(speed_table)

        with open(os.path.join(benchmarking_results_dir, 'speed_comparisons.txt'), 'a') as output_file:
            output_file.write(speed_table + u"\n")
    with open(os.path.join(benchmarking_results_dir, 'speed_comparisons.txt'), 'a') as output_file:
        output_file.write(u"\n")