def multi_threading_scrapping(url_list):
    """
    The scrapping function that will iterate through all the list of given urls.

    Parameters:
    url_list (List): List of urls.

    Returns:
    Void function

    """
    for link in url_list:
        url = link.get('href')
        downloaded = trafilatura.fetch_url(url)
        extracted = trafilatura.extract(downloaded)

        # Keep only pages whose main content could be extracted.
        if extracted is not None:

            # Make a GET request to fetch the raw HTML content
            html_content = requests.get(url).text

            # Parse the html content
            soup = BeautifulSoup(html_content, "lxml")

            temp.append(url)

            # Complete the dictionary.
            my_dict['Url'].append(url)
            my_dict['Title'].append(soup.title.text)
            my_dict['Content'].append(extracted)
            my_dict['Score'].append(0)  # all scores are initialized at 0.
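A minimal usage sketch for the function above, assuming the module-level temp list and my_dict dictionary it appends to; the seed URL is a placeholder and the anchor tags are collected with BeautifulSoup:

import requests
import trafilatura
from bs4 import BeautifulSoup

# Module-level containers the function appends to (assumed setup).
temp = []
my_dict = {'Url': [], 'Title': [], 'Content': [], 'Score': []}

# Collect absolute links from a seed page (placeholder URL).
seed_html = requests.get('https://example.com/').text
links = [a for a in BeautifulSoup(seed_html, 'lxml').find_all('a', href=True)
         if a['href'].startswith('http')]

multi_threading_scrapping(links)
print(len(temp), 'pages kept')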
Example 2
def get_text_content(id_, mime_type, content, trafilatura_options=None):
    if trafilatura_options is None:
        trafilatura_options = {}
    if is_plain_text_mime_type(mime_type):
        return content.decode('utf-8')
    elif is_html_like_mime_type(mime_type):
        return trafilatura.extract(content, **trafilatura_options)
    else:
        logging.warning(f'unexpected MIME type {mime_type} for {id_}')
        # try anyway
        return trafilatura.extract(content, **trafilatura_options)
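A hypothetical call to the helper above. The two MIME predicates are stubbed here purely for the sketch; in the original module they are defined elsewhere, and the options dict is passed straight through to trafilatura.extract:

import logging
import trafilatura

# Stub predicates for this sketch only (assumptions, not the original helpers).
def is_plain_text_mime_type(mime_type):
    return mime_type.startswith('text/plain')

def is_html_like_mime_type(mime_type):
    return mime_type in ('text/html', 'application/xhtml+xml')

html_bytes = b'<html><body><article><p>Hello, world. This is the main text.</p></article></body></html>'
text = get_text_content('doc-1', 'text/html', html_bytes,
                        trafilatura_options={'include_comments': False})
print(text)  # may be None for very short pages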
def get_page_text(url):
    downloaded = trafilatura.fetch_url(url=url)
    h = html2text.HTML2Text()
    h.ignore_links = True
    extracted_data = trafilatura.extract(downloaded)
    if extracted_data is not None:
        page_text_output = h.handle(extracted_data).replace('\n', ' ').replace('  ', ' ').strip()
        print('page_text_output len:', len(page_text_output))
        return page_text_output
    else:
        return ''
Example 4
def GetDocContent(topic_id, uuid, index='cw12'):
    url = baseUrl + '/cache?uuid={}&index={}&raw&plain'.format(uuid, index)
    # g = requests.get(url) 5e733d53-43e8-58f0-abfe-fa7fc2538733
    source_file = trafilatura.fetch_url(url) # g.text

    if not source_file:
        print('Cannot retrieve document {}'.format(uuid))
        time.sleep(0.5)
        return ' ', ' '
        # return GetDocContent(topic_id, uuid, index)

    print('Document has been retrieved successfully {}'.format(uuid))

    # Extract content using boilerpy3 and trafilatura, then combine results
    data_1 = trafilatura.extract(source_file)
    if data_1:
        data_1 = TAG_RE.sub('', data_1)
        doc_1 = nlp(data_1)
        sents_1 = [sent.text.strip().lower().replace('\n', ' ') for sent in doc_1.sents if len(sent.text) > 20]
    else:
        sents_1 = []

    data_2 = extractor.get_content(source_file)
    if data_2:
        data_2 = TAG_RE.sub('', data_2)
        doc_2 = nlp(data_2)
        sents_2 = [sent.text.strip().lower().replace('\n', ' ') for sent in doc_2.sents if len(sent.text) > 20]
    else:
        sents_2 = []

    final_data = list(set(sents_1) | set(sents_2))
    main_content = '\n'.join(final_data)

    return source_file, main_content
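A sketch of the module-level names the snippet above relies on; the cache host is a placeholder and the spaCy model choice is an assumption:

import re
import time
import spacy
import trafilatura
from boilerpy3 import extractors

baseUrl = 'https://example-document-cache'   # placeholder for the real cache host
TAG_RE = re.compile(r'<[^>]+>')              # strips leftover HTML tags
nlp = spacy.load('en_core_web_sm')           # assumed model, used for sentence splitting
extractor = extractors.ArticleExtractor()    # boilerpy3 extractor used as the second opinion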
def get_main_text_html(page):
    html_page = html.fromstring(str(page))
    extracted = trafilatura.extract(html_page)
    if extracted:
        return extracted
    print("Using default page text")
    return page.text
Example 6
def get_full_text_col(df: pd.DataFrame) -> None:
    """
    Adds a new column "full_text" containing the full text extracted from the URL
    of each row, and saves the resulting DataFrame to a CSV file.

    :param df: A DataFrame that contains a url column
    :return: None
    """
    full_text_list = []
    for url in tqdm(df['url']):
        try:
            content = trafilatura.fetch_url(url)
            full_text_list.append(
                trafilatura.extract(content,
                                    include_comments=False,
                                    include_tables=False,
                                    no_fallback=False))
        except Exception as e:
            print(e)
            full_text_list.append(np.nan)
        time.sleep(random.uniform(0.1, 1))
    print("Finished")
    df['full_text'] = full_text_list
    df.to_csv('../../data/raw/raw_data_facts_full_text.csv', index=False)
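A minimal sketch of how the function above might be called; the URLs are placeholders and the hard-coded output directory from the snippet is assumed to exist:

import random
import time
import numpy as np
import pandas as pd
import trafilatura
from tqdm import tqdm

df = pd.DataFrame({'url': ['https://example.com/', 'https://example.org/']})
get_full_text_col(df)  # adds the 'full_text' column and writes the CSV
print(df[['url', 'full_text']].head())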
def get_zdi_text_from_url(url):
    headers = {
        'Connection': 'keep-alive',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
    }
    text = trafilatura.extract(requests.get(url, headers=headers).text)
    if text is None:
        return ""

    new_text = ""
    for line in text.split("\n"):
        skip = False
        if re.findall(r"^\|", line):  # remove tables
            skip = True
        if re.findall(r"^- ", line):
            line = re.sub(r"^- ", "", line)
        if not skip:
            if not re.findall(r"\.$", line) and "CVE-" in line:  # looks like a header; don't add new line
                new_text += line + ". "
            else:
                new_text += re.sub(r"\.$", ". ", line) + "\n"  # normal line

    return new_text
Example 8
def html_extract_body_teixml(doc: bytes) -> dict:
    try:
        tei_xml = trafilatura.extract(
            doc,
            output_format="xmltei",
            include_comments=False,
            include_formatting=True,
        )
    except Exception as e:
        return dict(
            status="trafilatura-parse-error",
            error_msg=str(e)[:1000],
        )
    if tei_xml:
        body_txt = teixml_body_text(tei_xml)
        word_count = len(body_txt.split())
        return dict(status="success",
                    agent=TRAFILATURA_AGENT,
                    tei_xml=tei_xml,
                    word_count=word_count)
    elif doc.startswith(
            b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'
    ):
        # hack for firstmonday.org
        return html_extract_body_teixml(doc[106:])
    else:
        return dict(status="empty-xml", agent=TRAFILATURA_AGENT)
Example 9
    def import_from_url(self, author_id, url):
        """import_from_url is used to import memos from external sources in internet.

        Args:
            author_id: memo's author's id as ObjectId.
            url: source location where imported content is currently as string.

        Returns:
            Union(Memo, None): returns Memo if saved successfully, returns None if not or
                               content from website were empty.
        """
        try:
            imported = fetch_url(url)
            content = extract(imported,
                              include_comments=False,
                              include_tables=True,
                              output_format="txt")
            if not content:
                raise ValueError

            index = content.find('\n')
            while index != -1:
                content = content[:index] + "\n" + content[index:]
                index = content.find('\n', index + 2)
            url_i = (url.find("://"), url.find("/", url.find("://") + 3))
            title = "Imported from " + url[url_i[0] + 3:url_i[1]] if len(
                url[url_i[0] + 3:url_i[1]]) < 36 else url[url_i[0] + 3:36]
            return self.create(author_id, title, content)
        except ValueError:
            return None
Example 10
def run_trafilatura_fallback(htmlstring):
    '''run trafilatura (with fallback) on content'''
    result = extract(htmlstring,
                     no_fallback=False,
                     include_comments=False,
                     include_tables=True)
    return result
Example 11
def scrape_page(url: str) -> None:
    """
    A method that scrapes the articles found at a given URL and saves them as a CSV file.
    Particularly from 'https://www.bbc.com/mundo/search?q=<TERM_TO_SEARCH>'
    :param url: The url of the search engine
    :return: None
    """
    term_to_search = url.split('=')[-1]
    s = HTMLSession()
    # First search
    articles_pages, next_page = get_articles_pages(url, s)
    idx = 1

    with open(f'../../data/raw/bbc_articles_{term_to_search}.csv',
              'w',
              newline='') as csvfile:
        fieldnames = [
            'id', 'url', 'author', 'date', 'description', 'sitename', 'title',
            'text', 'categoria'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        while next_page:
            for page in articles_pages:
                print('url:', page)
                time.sleep(random.uniform(1, 2))
                try:
                    content = trafilatura.fetch_url(page)
                    article = trafilatura.metadata.extract_metadata(content)
                    article['text'] = trafilatura.extract(
                        content,
                        include_comments=False,
                        include_tables=False,
                        no_fallback=False)

                    writer.writerow({
                        'id': idx,
                        'url': article['url'],
                        'author': article['author'],
                        'date': article['date'],
                        'description': article['description'],
                        'sitename': article['sitename'],
                        'title': article['title'],
                        'text': article['text'],
                        'categoria': 'confiable'
                    })

                except Exception as e:
                    print("Failed to get content", e)
                idx += 1
            print('=' * 50)
            print("NEXT:", next_page)
            try:
                articles_pages, next_page = get_articles_pages(next_page, s)
            except Exception as e:
                print("Failed to get new search page", e)
            time.sleep(random.uniform(10, 15))
    print("Finished")
Example 12
def extract_body(url):
    #html = trafilatura.fetch_url(url)
    html = requests.get(url, headers=HDRS).text
    text = trafilatura.extract(html, include_comments=False)
    soup = BeautifulSoup(html, features="lxml")
    title = soup.title.string.split(' - ')[0]  # try to strip away website name
    text = title + '.\n\n' + (text or '')  # say the title at the beginning
    return title, text
Example 13
    def __main_content_extraction(self):
        try:
            text = trafilatura.extract(self.html, include_comments=False)
        except TypeError:  # library appears to be buggy
            text = None
        if text:
            self.text = NEWLINE_REGEX.sub('\n\n', text)
            self.np_article.set_text(text)
Example 14
def crawl_extracted_content(url, user_agent):
    html = None
    try:
        html = fetch_url(url, user_agent)
        extract_text = trafilatura.extract(html)
        return extract_text
    except Exception as ex:
        print(html)
        print(ex)
        return ""
Example 15
def run_trafilatura(htmlstring):
    '''run trafilatura (without fallback) on content'''
    return extract(
        htmlstring,
        no_fallback=True,
        include_comments=False,
        include_tables=True,
        include_formatting=False,
    )
Example 16
def run_trafilatura_recall(htmlstring):
    '''run trafilatura with preference for recall'''
    return extract(
        htmlstring,
        no_fallback=False,
        favor_recall=True,
        include_comments=False,
        include_tables=True,
        include_formatting=False,
    )
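A small comparison sketch, assuming both wrapper functions above are in scope: fetch one page and run the precision-oriented and recall-oriented variants side by side.

from trafilatura import extract, fetch_url

html = fetch_url('https://example.com/')   # placeholder page
if html:
    precise = run_trafilatura(html)        # no fallback extractors
    broad = run_trafilatura_recall(html)   # fallback enabled, favor_recall=True
    print(len(precise or ''), 'vs', len(broad or ''), 'characters extracted')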
Example 17
def url2sentences(url):  # crawl the page and split the text into sentence-like units
    downloaded = trafilatura.fetch_url(url)
    result = trafilatura.extract(downloaded)
    web_doclist = result  # the crawled text (text-density based extraction)
    sentences = re.sub('[-=.#/?:$}]', '', web_doclist)  # strip unneeded characters with a regular expression
    sentences = sentences.split()  # convert the string into a list
    for idx in range(0, len(sentences)):
        if len(sentences[idx]) <= 10:
            sentences[idx - 1] += (' ' + sentences[idx])
            sentences[idx] = ''
    return sentences
def Extract_Contents(clean_links):
    list2 = []
    for url in clean_links:
        downloaded = trafilatura.fetch_url(url)
        # extract the main content, without comments, as plain text
        list1 = trafilatura.extract(downloaded, include_comments=False)
        list2.append("\n")
        list2.append(
            "---------------------------------------------------------------------------------------------------------------------"
        )
        list2.append("\n")
        list2.append("Below contents are extracted from this url:")
        list2.append("\n")
        list2.append(url)
        list2.append("\n")
        list2.append(list1)
    list3 = ''.join(filter(None, list2))
    return list3
def main():
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        output[item_id] = {
            'articleBody': trafilatura.extract(html, include_comments=False)
        }
    (Path('output') / 'trafilatura.json').write_text(json.dumps(
        output, sort_keys=True, ensure_ascii=False, indent=4),
                                                     encoding='utf8')
Example 20
def readResults(urls, query):
    x = []  # Prepare the list to store results
    position = 0  # position on the serp
    for page in urls:  # Loop items in results
        downloaded = trafilatura.fetch_url(page)
        if downloaded is not None:  # assuming the download was successful
            result = trafilatura.extract(downloaded,
                                         include_tables=False,
                                         include_formatting=False,
                                         include_comments=False)
            x.append(result)
    return x
Example 21
    def extract(self, url: str, html_text: str):
        # don't fallback to readability/justext because we have our own hierarchy of things to try
        text = trafilatura.extract(html_text, no_fallback=True)
        self.content = {
            'url': url,
            'text': text,
            'title': None,
            'publish_date': None,
            'top_image_url': None,
            'authors': None,
            'extraction_method': METHOD_TRIFILATURA,
        }
Example 22
    def post(self):
        content = request.json
        logging.debug(content)

        language = content['language']
        show_explanations = 'explain' in content and content['explain']
        show_highlights = 'highlights' in content and content['highlights']
        disallowed_rels = content['disallowed_rels'] if 'disallowed_rels' in content else args.disallowed_rels.split(';')

        if 'uri' in content:
            downloaded = trafilatura.fetch_url(content['uri'])
            if downloaded is None:
                return jsonify({ "error": "Could not fetch URL" })
            text = trafilatura.extract(downloaded)
        elif 'text' in content:
            text = content['text']
        else:
            return jsonify({ "error": "Request must include either 'uri' or 'text'" })

        labels = content['labels']

        # Process text with labels
        response = predict(text, labels, language, disallowed_rels, show_explanations, show_highlights)

        ### response looks like this ###
        ### (both the labels and the paths are sorted by score ###
        # [{'label': 'space',
        #   'score': 1.2366091944277287,
        #   'terms': [{'paths': [['space', 'label']], 'score': 1.0},
        #    {'paths': [['star', 'locatedat', 'space']], 'score': 0.18517242},
        #    {'paths': [['love', 'isa', 'television_show'],
        #      ['television_show', 'isa', 'space']],
        #     'score': 0.05143677}]},
        #  {'label': 'technology',
        #   'score': 0.1451974897645414,
        #   'terms': [{'paths': [['space', 'relatedto', 'science_fiction'],
        #      ['science_fiction', 'relatedto', 'technology']],
        #     'score': 0.14295651},
        #    {'paths': [['love', 'relatedto', 'technophilia'],
        #      ['technophilia', 'relatedto', 'technology']],
        #     'score': 0.0022409796}]},
        #  {'label': 'medicine',
        #   'score': 0.05455923452973366,
        #   'terms': [{'paths': [['space', 'relatedto', 'science'],
        #      ['science', 'relatedto', 'medicine']],
        #     'score': 0.054559235}]}]

        # Return the output as a JSON string
        return jsonify({
            "text": text,
            "labels": labels,
            "results": response
        })
def get_tenable_text_from_url(url):
    headers = {
        'Connection': 'keep-alive',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
    }
    response = requests.get(url, headers=headers)
    return trafilatura.extract(response.text)
Example 24
def content_extract_with_t(url_list):
    # Loop over the list, take each URL in turn, and print its extracted main content

    for i in range(len(url_list)):
        print('URL :', url_list[i])
        downloaded = trafilatura.fetch_url(url_list[i])
        content = trafilatura.extract(downloaded)

        print(
            "*************************************************************************************************"
        )
        print(content)
        print(
            "*************************************************************************************************"
        )
Example 25
def Sentiment(request):
    ajx = False
    if request.method == 'POST':
        if request.POST.get('text_up'):
            text = request.POST.get('text')
        elif request.POST.get('url_up'):
            url_up = request.POST.get('text')
            url = re.search("(?P<url>https?://[^\s]+)", url_up).group("url")
            downloaded = trafilatura.fetch_url(url)
            text = trafilatura.extract(downloaded)

        elif request.POST.get('upld'):
            text = upload(request.FILES['file'])

        if request.is_ajax():
            text = request.POST.get('text')
            ajx = True
        # text = request.POST.get('text')
        print(text)
        sid = SentimentIntensityAnalyzer()

        #message_text = '''It seems to me we are in the middle of no man's land with respect to the  following:  Opec production speculation, Mid east crisis and renewed  tensions, US elections and what looks like a slowing economy (?), and no real weather anywhere in the world. I think it would be most prudent to play  the markets from a very flat price position and try to day trade more aggressively. I have no intentions of outguessing Mr. Greenspan, the US. electorate, the Opec ministers and their new important roles, The Israeli and Palestinian leaders, and somewhat importantly, Mother Nature.  Given that, and that we cannot afford to lose any more money, and that Var seems to be a problem, let's be as flat as possible. I'm ok with spread risk  (not front to backs, but commodity spreads). The morning meetings are not inspiring, and I don't have a real feel for  everyone's passion with respect to the markets.  As such, I'd like to ask  John N. to run the morning meetings on Mon. and Wed.  Thanks. Jeff'''

        message = text

        # Calling the polarity_scores method on sid and passing in the message_text outputs a dictionary with negative, neutral, positive, and compound scores for the input text
        scores = sid.polarity_scores(message)

        # Here we loop through the keys contained in scores (pos, neu, neg, and compound scores) and print the key-value pairs on the screen
        d = {}
        for key in sorted(scores):
            print('{0}: {1}, '.format(key, scores[key]), end='')
            val = round(scores[key] * 100, 2)
            d.update({key: val})
        print(d)

        d.update({"flag": 1, "text": text})
        print(d)

        if ajx:
            return JsonResponse(d, status=200)
        else:
            return render(request, 'Sentiment.html', context=d)
    else:
        if ajx:
            return JsonResponse(None, status=200)
        else:
            return render(request, 'Sentiment.html', {"message": None})
Example 26
def LangTranslate(request):
    language = list(LANGUAGES.values())
    ajx = False
    if request.method == 'POST':
        if request.POST.get('text_up'):
            text = request.POST.get('text')
        elif request.POST.get('url_up'):
            url_up = request.POST.get('text')
            url = re.search("(?P<url>https?://[^\s]+)", url_up).group("url")
            downloaded = trafilatura.fetch_url(url)
            text = trafilatura.extract(downloaded)

        elif request.POST.get('upld'):
            text = upload(request.FILES['file'])
        if request.is_ajax():
            text = request.POST.get('text')
            ajx = True

        inputLanguage = str(request.POST.get('in_lang')).lower()
        outputLanguage = str(request.POST.get('out_lang')).lower()
        dataToTranslate = text
        print(inputLanguage, outputLanguage)
        translator = Translator(from_lang=inputLanguage,
                                to_lang=outputLanguage)
        translation = translator.translate(dataToTranslate)
        if ajx:
            return JsonResponse(
                {
                    'translation': translation,
                    'language': language,
                    'text': text
                },
                status=200)
        else:
            return render(
                request, 'LangTranslate.html', {
                    'translation': translation,
                    'language': language,
                    'text': text,
                    'in_lang': inputLanguage,
                    'out_lang': outputLanguage
                })
    else:
        if ajx:
            return JsonResponse(None, status=200)
        else:
            return render(request, 'LangTranslate.html',
                          {'language': language})
Example 27
def get_title_text_web(url):
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        title = 'Not working title'
        text = 'Not working text'
        check = 'fake'
        dictio = {'title': [title], 'text': [text], 'check': check}
        df = pd.DataFrame(dictio, columns=['title', 'text', 'check'])
        return df
    text = trafilatura.extract(downloaded)
    html = request.urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('title').string
    dictio = {'title': [title], 'text': [text], 'check': True}
    df = pd.DataFrame(dictio, columns=['title', 'text', 'check'])
    return df
Example 28
    def generate_quizzes_url(self,
                             url,
                             n_choices=3,
                             window_size=400,
                             rolling=300,
                             n_questions=5):
        r = requests.get(url)
        encoding = chardet.detect(r.content)['encoding']
        r.encoding = encoding
        content = trafilatura.extract(r.text)
        response = []

        lasted_q = ''
        ith = 1
        for line in content.split('\n'):
            if len(line) > 150 and ith <= n_questions:
                for i in range(0, max(1, len(line) - window_size), rolling):
                    tmp_text = line[i:i + window_size]
                    q, a = self.generate_quiz(tmp_text)
                    if q and a:
                        try:
                            if q == lasted_q:
                                continue
                            x = self.generate_choices(line, a, n_choices)
                            if len(x) != n_choices + 1:
                                if len(x) == n_choices:
                                    x.append('ไม่มีข้อใดถูก')  # Thai for "None of the above"
                                else:
                                    continue

                            if len(x) == n_choices + 1:
                                response.append(
                                    dict({
                                        "question": q,
                                        "choices": x,
                                        "answer": a,
                                        "answer_idx": x.index(a)
                                    }))
                                lasted_q = q
                                ith += 1
                            else:
                                continue
                        except:
                            pass
                    if ith > n_questions:
                        break
        return response
Example 29
def detect_content_languages(id_, content):
    if not content:
        return 'SKIP:EMPTY-CONTENT'
    try:
        text_content = trafilatura.extract(content)
    except Exception as e:
        logging.error(f'failed extract for {id_}: {e}')
        return f'SKIP:EXTRACT-ERROR: {e}'

    if not text_content:
        return 'SKIP:EMPTY-TEXT'
    try:
        langs = detect_langs(text_content)
    except Exception as e:
        logging.error(f'failed langdetect for {id_}: {e}')
        return f'SKIP:LANGDETECT-ERROR: {e}'
    return langs
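A short usage sketch with the imports the snippet above assumes; detect_langs comes from the langdetect package and the URL is a placeholder:

import logging
import trafilatura
from langdetect import detect_langs

html = trafilatura.fetch_url('https://example.com/')  # placeholder page
print(detect_content_languages('doc-1', html))  # e.g. [en:0.99...] or a SKIP:* marker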
Example 30
def test_html():
    import requests
    import xml.dom.minidom
    import logging
    #logging.getLogger("trafilatura").setLevel(logging.FATAL)

    #response = requests.get("https://devmuaz.medium.com/flutter-clean-architecture-series-part-1-d2d4c2e75c47")
    #response = requests.get("https://www.theguardian.com/environment/2021/apr/28/speed-at-which-worlds-glaciers-are-melting-has-doubled-in-20-years")
    response = requests.get("https://dmitryelj.medium.com/howto-using-a-pager-in-the-21st-century-6a57454ecde8")

    result = trafilatura.extract(response.text, include_formatting=True, with_metadata=True, output_format="html",
                                 include_images=True, include_links=True)
    #print(result)

    dom = xml.dom.minidom.parseString(result)
    pretty_xml_as_string = dom.toprettyxml()
    print(pretty_xml_as_string)