def multi_threading_scrapping(url_list):
    """Scrape every link in ``url_list`` and record results in module state.

    Appends each successfully-extracted URL to the module-level ``temp``
    list and fills the module-level ``my_dict`` accumulator (keys 'Url',
    'Title', 'Content', 'Score').

    Parameters:
        url_list (List): list of link elements exposing ``.get('href')``.

    Returns:
        None
    """
    for link in url_list:
        url = link.get('href')
        downloaded = trafilatura.fetch_url(url)
        # Fix: extract once and reuse (the original called extract() twice
        # per page) and use `is not None` instead of `!= None`.
        extracted = trafilatura.extract(downloaded)
        if extracted is not None:
            # Make a GET request to fetch the raw HTML content so we can
            # read the page <title>.
            html_content = requests.get(url).text
            # Parse the html content
            soup = BeautifulSoup(html_content, "lxml")
            temp.append(url)
            # Complete the dictionary.
            my_dict['Url'].append(url)
            my_dict['Title'].append(soup.title.text)
            my_dict['Content'].append(extracted)
            my_dict['Score'].append(0)  # all scores are initialized at 0
def get_text_content(id_, mime_type, content, trafilatura_options=None):
    """Return text for *content*: decoded directly for plain text, or
    extracted with trafilatura for HTML-like (and unknown) MIME types.
    Unknown MIME types are logged before the extraction attempt."""
    opts = trafilatura_options or {}
    if is_plain_text_mime_type(mime_type):
        return content.decode('utf-8')
    if not is_html_like_mime_type(mime_type):
        logging.warning(f'unexpected MIME type {mime_type} for {id_}')
        # try anyway — fall through to extraction
    return trafilatura.extract(content, **opts)
def get_page_text(url):
    """Fetch *url* and return its main text as one whitespace-normalized line.

    Returns '' when trafilatura cannot extract any content.
    """
    downloaded = trafilatura.fetch_url(url=url)
    # Fix: the original called trafilatura.extract() and h.handle() twice,
    # discarding the first results. Compute once and return that value.
    extracted = trafilatura.extract(downloaded)
    if extracted is None:
        return ''
    h = html2text.HTML2Text()
    h.ignore_links = True
    page_text_output = h.handle(extracted).replace('\n', ' ').replace('  ', ' ').strip()
    print('page_text_output len:', len(page_text_output))
    return page_text_output
def GetDocContent(topic_id, uuid, index='cw12'): url = baseUrl + '/cache?uuid={}&index={}&raw&plain'.format(uuid, index) # g = requests.get(url) 5e733d53-43e8-58f0-abfe-fa7fc2538733 source_file = trafilatura.fetch_url(url) # g.text if not source_file: print('Cannot retrieve document {}'.format(uuid)) time.sleep(0.5) return ' ', ' ' # return GetDocContent(topic_id, uuid, index) print('Document has been retrieved succesfully {}'.format(uuid)) # Extract content using boilerpy3 and trafilatura, then combine results data_1 = trafilatura.extract(source_file) if data_1: data_1 = TAG_RE.sub('', data_1) doc_1 = nlp(data_1) sents_1 = [sent.text.strip().lower().replace('\n', ' ') for sent in doc_1.sents if len(sent.text) > 20] else: sents_1 = [] data_2 = extractor.get_content(source_file) if data_2: data_2 = TAG_RE.sub('', data_2) doc_2 = nlp(data_2) sents_2 = [sent.text.strip().lower().replace('\n', ' ') for sent in doc_2.sents if len(sent.text) > 20] else: sents_2 = [] final_data = list(set(sents_1) | set(sents_2)) main_content = '\n'.join(final_data) return source_file, main_content
def get_main_text_html(page):
    """Extract the main text of *page* via trafilatura; fall back to the
    page's raw ``.text`` attribute when extraction yields nothing."""
    tree = html.fromstring(str(page))
    content = trafilatura.extract(tree)
    if not content:
        print("Using default page text")
        return page.text
    return content
def get_full_text_col(df: pd.DataFrame, output_path: str = '../../data/raw/raw_data_facts_full_text.csv') -> None:
    """
    A method that will add a new column "full_text" in which the full text
    of the URL associated to the row will be there. It will also save the
    new DataFrame in a csv file.

    :param df: A DataFrame that contains a url column
    :param output_path: destination csv path; defaults to the original
        hard-coded location, so existing callers are unaffected
    :return: None
    """
    full_text_list = []
    for url in tqdm(df['url']):
        try:
            content = trafilatura.fetch_url(url)
            full_text_list.append(
                trafilatura.extract(content,
                                    include_comments=False,
                                    include_tables=False,
                                    no_fallback=False))
        except Exception as e:
            # Best-effort: keep the row, record the text as missing.
            print(e)
            full_text_list.append(np.nan)
        # polite random delay between requests
        time.sleep(random.uniform(0.1, 1))
    print("Finished")
    df['full_text'] = full_text_list
    df.to_csv(output_path, index=False)
def get_zdi_text_from_url(url):
    """Fetch a ZDI advisory page and return its text, lightly reformatted.

    Table rows (lines starting with '|') are dropped, leading '- ' bullets
    are stripped, and header-like lines containing 'CVE-' are joined with
    a '. ' instead of a newline.

    Returns "" when trafilatura cannot extract any content.
    """
    headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
    }
    text = trafilatura.extract(requests.get(url, headers=headers).text)
    if text is None:
        # Fix: extract() returns None on failure; the original crashed
        # with AttributeError on text.split() here.
        return ""
    new_text = ""
    for line in text.split("\n"):
        skip = False
        if re.findall(r"^\|", line):  # remove tables
            skip = True
        if re.findall(r"^- ", line):
            line = re.sub(r"^- ", "", line)
        if not skip:
            if not re.findall(
                r"\.$", line
            ) and "CVE-" in line:  # looks like a header; don't add new line
                new_text += line + ". "
            else:
                new_text += re.sub(r"\.$", ". ", line) + "\n"  # normal line
    return new_text
def html_extract_body_teixml(doc: bytes) -> dict:
    """Extract the body of an HTML document as TEI-XML via trafilatura.

    Returns a dict keyed by ``status``: "success" (with tei_xml and
    word_count), "trafilatura-parse-error" (with error_msg), or
    "empty-xml".
    """
    try:
        tei_xml = trafilatura.extract(
            doc,
            output_format="xmltei",
            include_comments=False,
            include_formatting=True,
        )
    # Fix: the original caught (ValueError, TypeError, Exception) — the
    # first two are redundant since Exception already covers them.
    except Exception as e:
        return dict(
            status="trafilatura-parse-error",
            error_msg=str(e)[:1000],
        )
    if tei_xml:
        body_txt = teixml_body_text(tei_xml)
        word_count = len(body_txt.split())
        return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count)
    elif doc.startswith(
        b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'
    ):
        # hack for firstmonday.org: skip the malformed doctype and retry
        return html_extract_body_teixml(doc[106:])
    else:
        return dict(status="empty-xml", agent=TRAFILATURA_AGENT)
def import_from_url(self, author_id, url):
    """import_from_url is used to import memos from external sources in internet.

    Args:
        author_id: memo's author's id as ObjectId.
        url: source location where imported content is currently as string.

    Returns:
        Union(Memo, None): returns Memo if saved successfully, returns None
        if not or content from website were empty.
    """
    try:
        imported = fetch_url(url)
        content = extract(imported, include_comments=False, include_tables=True, output_format="txt")
        if not content:
            raise ValueError
        # Double each newline (insert an extra '\n' before every existing
        # one), presumably to create blank lines between paragraphs; the
        # find restarts past the inserted pair to avoid an infinite loop.
        index = content.find('\n')
        while index != -1:
            content = content[:index] + "\n" + content[index:]
            index = content.find('\n', index + 2)
        # Build a title from the URL's host (between '://' and next '/').
        url_i = (url.find("://"), url.find("/", url.find("://") + 3))
        # NOTE(review): the else branch slices url[url_i[0] + 3:36] — an
        # absolute end index of 36, not 36 chars of the host, and without
        # the "Imported from " prefix. Looks suspicious; confirm intent.
        title = "Imported from " + url[url_i[0] + 3:url_i[1]] if len(
            url[url_i[0] + 3:url_i[1]]) < 36 else url[url_i[0] + 3:36]
        return self.create(author_id, title, content)
    except ValueError:
        return None
def run_trafilatura_fallback(htmlstring):
    '''run trafilatura (with fallback) on content'''
    # fallback enabled (no_fallback=False); comments off, tables kept
    extracted = extract(htmlstring,
                        no_fallback=False,
                        include_comments=False,
                        include_tables=True)
    return extracted
def scrape_page(url: str) -> None:
    """
    A method that will be scrape the articles of a given url and save it as a
    csv file. Particularly from
    'https://www.bbc.com/mundo/search?q=<TERM_TO_SEARCH>'
    :param url: The url of the search engine
    :return: None
    """
    # The search term is the value after '=' in the search URL.
    term_to_search = url.split('=')[-1]
    s = HTMLSession()
    # # First search
    articles_pages, next_page = get_articles_pages(url, s)
    idx = 1
    with open(f'../../data/raw/bbc_articles_{term_to_search}.csv', 'w', newline='') as csvfile:
        fieldnames = [
            'id', 'url', 'author', 'date', 'description', 'sitename', 'title',
            'text', 'categoria'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # Follow pagination until get_articles_pages reports no next page.
        while next_page:
            for page in articles_pages:
                print(f'url:', page)
                # polite random delay between article downloads
                time.sleep(random.uniform(1, 2))
                try:
                    content = trafilatura.fetch_url(page)
                    # NOTE(review): in newer trafilatura versions
                    # extract_metadata returns a Document object rather than
                    # a dict; confirm item assignment/subscripting works
                    # with the pinned version.
                    article = trafilatura.metadata.extract_metadata(content)
                    article['text'] = trafilatura.extract(
                        content,
                        include_comments=False,
                        include_tables=False,
                        no_fallback=False)
                    writer.writerow({
                        'id': idx,
                        'url': article['url'],
                        'author': article['author'],
                        'date': article['date'],
                        'description': article['description'],
                        'sitename': article['sitename'],
                        'title': article['title'],
                        'text': article['text'],
                        'categoria': 'confiable'
                    })
                except Exception as e:
                    # best-effort: a failed article doesn't stop the crawl
                    print("Failed to get content", e)
                idx += 1
            print('=' * 50)
            print("NEXT:", next_page)
            try:
                articles_pages, next_page = get_articles_pages(next_page, s)
            except Exception as e:
                print("Failed to get new search page", e)
            # longer delay between search-result pages
            time.sleep(random.uniform(10, 15))
    print("Finished")
def extract_body(url):
    """Fetch *url* and return (title, text) where text is the page title
    followed by the trafilatura-extracted body.

    The title is the part of <title> before the first ' - ' (an attempt to
    strip the website name); '' when the page has no usable <title>.
    """
    # html = trafilatura.fetch_url(url)
    html = requests.get(url, headers=HDRS).text
    text = trafilatura.extract(html, include_comments=False)
    soup = BeautifulSoup(html, features="lxml")
    # Fix: soup.title (or .string) is None on pages without a <title> tag;
    # the original crashed with AttributeError here.
    if soup.title is not None and soup.title.string:
        title = soup.title.string.split(' - ')[0]  # try to strip away website name
    else:
        title = ''
    text = title + '.\n\n' + (text or '')  # say the title at the beginning
    return title, text
def __main_content_extraction(self):
    """Extract the page's main text from ``self.html`` and, when found,
    store it on ``self.text`` (newlines collapsed) and in the newspaper
    article object."""
    try:
        extracted = trafilatura.extract(self.html, include_comments=False)
    except TypeError:
        # library appears to be buggy
        extracted = None
    if extracted:
        self.text = NEWLINE_REGEX.sub('\n\n', extracted)
        self.np_article.set_text(extracted)
def crawl_extracted_content(url, user_agent):
    """Fetch *url* and return its extracted main text; "" on any error.

    Note: on extraction failure trafilatura.extract may itself return
    None — that value is passed through unchanged, matching the original.
    """
    html = None  # Fix: pre-bind so the except block can safely print it;
    # the original raised NameError here when fetch_url itself threw.
    try:
        html = fetch_url(url, user_agent)
        return trafilatura.extract(html)
    except Exception as ex:
        print(html)
        print(ex)
        return ""
def run_trafilatura(htmlstring):
    '''run trafilatura (without fallback) on content'''
    # fallback disabled; comments and formatting off, tables kept
    result = extract(htmlstring,
                     no_fallback=True,
                     include_comments=False,
                     include_tables=True,
                     include_formatting=False)
    return result
def run_trafilatura_recall(htmlstring):
    '''run trafilatura with preference for recall'''
    # fallback enabled and favor_recall set; comments/formatting off
    result = extract(htmlstring,
                     no_fallback=False,
                     favor_recall=True,
                     include_comments=False,
                     include_tables=True,
                     include_formatting=False)
    return result
def url2sentences(url):
    """Crawl *url* and split its extracted text into sentence-like chunks.

    (Translated from the original Korean comment: "a function that crawls
    the data and splits it into sentence units".)

    Returns [] when trafilatura cannot extract any content.
    """
    downloaded = trafilatura.fetch_url(url)
    result = trafilatura.extract(downloaded)  # main text (text-density based)
    if result is None:
        # Fix: extract() returns None on failure; the original crashed
        # in re.sub below.
        return []
    web_doclist = result
    # Remove unneeded punctuation characters via regex.
    sentences = re.sub('[-=.#/?:$}]', '', web_doclist)
    # Split the string into a list of tokens.
    sentences = sentences.split()
    # Merge short fragments (<= 10 chars) into the previous entry,
    # leaving an empty string in their place.
    for idx in range(0, len(sentences)):
        if len(sentences[idx]) <= 10:
            sentences[idx - 1] += (' ' + sentences[idx])
            sentences[idx] = ''
    return sentences
def Extract_Contents(clean_links):
    """Download each URL and concatenate the extracted main contents.

    Returns one string with a banner line and the source URL before each
    page's content. Failed extractions (None) are dropped by the final
    filter(None, ...).
    """
    list2 = []
    for url in clean_links:
        downloaded = trafilatura.fetch_url(url)
        # Fix: the original also called trafilatura.extract(downloaded)
        # here and discarded the result — removed as dead work.
        list1 = trafilatura.extract(downloaded, include_comments=False)
        list2.append("\n")
        list2.append(
            "---------------------------------------------------------------------------------------------------------------------"
        )
        list2.append("\n")
        list2.append("Below contents are extracted from this url:")
        list2.append("\n")
        list2.append(url)
        list2.append("\n")
        list2.append(list1)
    list3 = ''.join(filter(None, list2))
    return list3
def main():
    """Run trafilatura over every gzipped HTML file in ./html and write the
    extracted bodies to output/trafilatura.json, keyed by file stem."""
    results = {}
    for gz_path in Path('html').glob('*.html.gz'):
        with gzip.open(gz_path, 'rt', encoding='utf8') as fh:
            page = fh.read()
        item_id = gz_path.stem.split('.')[0]
        results[item_id] = {
            'articleBody': trafilatura.extract(page, include_comments=False)
        }
    payload = json.dumps(results, sort_keys=True, ensure_ascii=False, indent=4)
    (Path('output') / 'trafilatura.json').write_text(payload, encoding='utf8')
def readResults(urls, query):
    """Download every SERP result page in *urls* and return the list of
    extracted texts. ``query`` is accepted for interface compatibility but
    is not used here."""
    texts = []
    for page in urls:
        downloaded = trafilatura.fetch_url(page)
        if downloaded is None:
            continue  # download failed; skip this result
        texts.append(
            trafilatura.extract(downloaded,
                                include_tables=False,
                                include_formatting=False,
                                include_comments=False))
    return texts
def extract(self, url: str, html_text: str):
    """Run trafilatura on *html_text* and populate ``self.content`` with
    the standard result dict (only url/text/extraction_method are filled;
    the remaining fields are left as None)."""
    # don't fallback to readability/justext because we have our own
    # hierarchy of things to try
    body = trafilatura.extract(html_text, no_fallback=True)
    self.content = dict(
        url=url,
        text=body,
        title=None,
        publish_date=None,
        top_image_url=None,
        authors=None,
        extraction_method=METHOD_TRIFILATURA,
    )
def post(self):
    """Flask POST handler: classify request text (or text fetched from a
    'uri') against the supplied labels and return scored results as JSON.
    """
    content = request.json
    logging.debug(content)
    language = content['language']
    # Optional flags; absent keys default to False / server-wide setting.
    show_explanations = 'explain' in content and content['explain']
    show_highlights = 'highlights' in content and content['highlights']
    disallowed_rels = content['disallowed_rels'] if 'disallowed_rels' in content else args.disallowed_rels.split(';')
    if 'uri' in content:
        downloaded = trafilatura.fetch_url(content['uri'])
        if downloaded is None:
            return jsonify({"error": "Could not fetch URL"})
        text = trafilatura.extract(downloaded)
    elif 'text' in content:
        text = content['text']
    # NOTE(review): if the payload has neither 'uri' nor 'text', `text` is
    # unbound and predict() below raises NameError — confirm the API
    # contract guarantees one of the two.
    # Process text with labels...
    labels = content['labels']
    # Process text with labels
    response = predict(text, labels, language, disallowed_rels, show_explanations, show_highlights)
    ### response looks like this ###
    ### (both the labels and the paths are sorted by score ###
    # [{'label': 'space',
    #   'score': 1.2366091944277287,
    #   'terms': [{'paths': [['space', 'label']], 'score': 1.0},
    #             {'paths': [['star', 'locatedat', 'space']], 'score': 0.18517242},
    #             {'paths': [['love', 'isa', 'television_show'],
    #                        ['television_show', 'isa', 'space']],
    #              'score': 0.05143677}]},
    #  {'label': 'technology',
    #   'score': 0.1451974897645414,
    #   'terms': [{'paths': [['space', 'relatedto', 'science_fiction'],
    #                        ['science_fiction', 'relatedto', 'technology']],
    #              'score': 0.14295651},
    #             {'paths': [['love', 'relatedto', 'technophilia'],
    #                        ['technophilia', 'relatedto', 'technology']],
    #              'score': 0.0022409796}]},
    #  {'label': 'medicine',
    #   'score': 0.05455923452973366,
    #   'terms': [{'paths': [['space', 'relatedto', 'science'],
    #                        ['science', 'relatedto', 'medicine']],
    #              'score': 0.054559235}]}]
    # Return the output as a JSON string
    return jsonify({
        "text": text,
        "labels": labels,
        "results": response
    })
def get_tenable_text_from_url(url):
    """Download a Tenable advisory page with browser-like headers and
    return its trafilatura-extracted main text."""
    headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
    }
    page = requests.get(url, headers=headers)
    return trafilatura.extract(page.text)
def content_extract_with_t(url_list):
    """Print the extracted main content of every URL in *url_list*.

    (Translated from the original Korean comment: "loop over the list,
    take URLs one by one, and print each page's body text".)
    """
    # Idiom fix: iterate the list directly instead of range(len(...)).
    for url in url_list:
        print('URL :', url)
        downloaded = trafilatura.fetch_url(url)
        content = trafilatura.extract(downloaded)
        print(
            "*************************************************************************************************"
        )
        print(content)
        print(
            "*************************************************************************************************"
        )
def Sentiment(request):
    """Django view: run VADER sentiment analysis on submitted text.

    The text can come from a textarea ('text_up'), a URL ('url_up', body
    extracted with trafilatura), or an uploaded file ('upld'); AJAX
    requests take the raw 'text' field. Scores are returned as
    percentages, as JSON for AJAX or rendered into Sentiment.html.
    """
    ajx = False
    if request.method == 'POST':
        if request.POST.get('text_up'):
            text = request.POST.get('text')
        elif request.POST.get('url_up'):
            # Pull the first http(s) URL out of the submitted text and
            # extract that page's main content.
            url_up = request.POST.get('text')
            url = re.search("(?P<url>https?://[^\s]+)", url_up).group("url")
            downloaded = trafilatura.fetch_url(url)
            text = trafilatura.extract(downloaded)
        elif request.POST.get('upld'):
            text = upload(request.FILES['file'])
        if request.is_ajax():
            text = request.POST.get('text')
            ajx = True
        # text = request.POST.get('text')
        print(text)
        sid = SentimentIntensityAnalyzer()
        # message_text = '''It seems to me we are in the middle of no man's land with respect to the following: Opec production speculation, Mid east crisis and renewed tensions, US elections and what looks like a slowing economy (?), and no real weather anywhere in the world. I think it would be most prudent to play the markets from a very flat price position and try to day trade more aggressively. I have no intentions of outguessing Mr. Greenspan, the US. electorate, the Opec ministers and their new important roles, The Israeli and Palestinian leaders, and somewhat importantly, Mother Nature. Given that, and that we cannot afford to lose any more money, and that Var seems to be a problem, let's be as flat as possible. I'm ok with spread risk (not front to backs, but commodity spreads). The morning meetings are not inspiring, and I don't have a real feel for everyone's passion with respect to the markets. As such, I'd like to ask John N. to run the morning meetings on Mon. and Wed. Thanks.
        # Jeff'''
        message = text
        # Calling the polarity_scores method on sid and passing in the
        # message_text outputs a dictionary with negative, neutral,
        # positive, and compound scores for the input text
        scores = sid.polarity_scores(message)
        # Here we loop through the keys contained in scores (pos, neu, neg,
        # and compound scores) and print the key-value pairs on the screen
        d = {}
        for key in sorted(scores):
            print('{0}: {1}, '.format(key, scores[key]), end='')
            # convert to a display percentage
            val = round(scores[key] * 100, 2)
            d.update({key: val})
        print(d)
        d.update({"flag": 1, "text": text})
        print(d)
        if ajx:
            return JsonResponse(d, status=200)
        else:
            return render(request, 'Sentiment.html', context=d)
    else:
        if ajx:
            return JsonResponse(None, status=200)
        else:
            return render(request, 'Sentiment.html', {"message": None})
def LangTranslate(request):
    """Django view: translate submitted text between two chosen languages.

    The text can come from a textarea ('text_up'), a URL ('url_up', body
    extracted with trafilatura), or an uploaded file ('upld'); AJAX
    requests take the raw 'text' field. Responds with JSON for AJAX or
    renders LangTranslate.html.
    """
    language = list(LANGUAGES.values())
    ajx = False
    if request.method == 'POST':
        if request.POST.get('text_up'):
            text = request.POST.get('text')
        elif request.POST.get('url_up'):
            # Pull the first http(s) URL out of the submitted text and
            # extract that page's main content.
            url_up = request.POST.get('text')
            url = re.search("(?P<url>https?://[^\s]+)", url_up).group("url")
            downloaded = trafilatura.fetch_url(url)
            text = trafilatura.extract(downloaded)
        elif request.POST.get('upld'):
            text = upload(request.FILES['file'])
        if request.is_ajax():
            text = request.POST.get('text')
            ajx = True
        # NOTE(review): if none of the three POST flags is set and the
        # request is not AJAX, `text` is never bound and the translate
        # call below raises NameError — confirm the form always sets one.
        inputLanguage = str(request.POST.get('in_lang')).lower()
        outputLanguage = str(request.POST.get('out_lang')).lower()
        dataToTranslate = text
        print(inputLanguage, outputLanguage)
        translator = Translator(from_lang=inputLanguage, to_lang=outputLanguage)
        translation = translator.translate(dataToTranslate)
        if ajx:
            return JsonResponse(
                {
                    'translation': translation,
                    'language': language,
                    'text': text
                },
                status=200)
        else:
            return render(
                request, 'LangTranslate.html', {
                    'translation': translation,
                    'language': language,
                    'text': text,
                    'in_lang': inputLanguage,
                    'out_lang': outputLanguage
                })
    else:
        if ajx:
            return JsonResponse(None, status=200)
        else:
            return render(request, 'LangTranslate.html', {'language': language})
def get_title_text_web(url):
    """Fetch *url* and return a one-row DataFrame with columns
    title / text / check.

    On download failure the row carries placeholder strings and
    check='fake'; on success it carries the page <title>, the extracted
    body, and check=True.
    """
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:  # fix: `is None` instead of `== None`
        # Fix: the original swapped the placeholders
        # (title='Not working text', text='Not working title').
        title = 'Not working title'
        text = 'Not working text'
        check = 'fake'
        dictio = {'title': [title], 'text': [text], 'check': check}
        df = pd.DataFrame(dictio, columns=['title', 'text', 'check'])
        return df
    text = trafilatura.extract(downloaded)
    html = request.urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('title').string
    dictio = {'title': [title], 'text': [text], 'check': True}
    df = pd.DataFrame(dictio, columns=['title', 'text', 'check'])
    return df
def generate_quizzes_url(self, url, n_choices=3, window_size=400, rolling=300, n_questions=5):
    """Generate up to *n_questions* multiple-choice quizzes from a web page.

    Slides a window of ``window_size`` characters (step ``rolling``) over
    each long line of the trafilatura-extracted page text, asks the model
    for a question/answer pair, then builds ``n_choices`` distractors.

    Returns:
        list[dict]: dicts with keys question / choices / answer /
        answer_idx.
    """
    r = requests.get(url)
    # Bug fix: chardet.detect() returns a dict such as
    # {'encoding': 'utf-8', 'confidence': ...}. The original assigned the
    # whole dict to r.encoding, so the response was never decoded with the
    # detected charset. Use the encoding name itself.
    r.encoding = chardet.detect(r.content)['encoding']
    content = trafilatura.extract(r.text)
    response = []
    lasted_q = ''  # last emitted question, used to skip duplicates
    ith = 1
    for line in content.split('\n'):
        # Only consider reasonably long lines, and stop producing once
        # enough questions have been generated.
        if len(line) > 150 and ith <= n_questions:
            for i in range(0, max(1, len(line) - window_size), rolling):
                tmp_text = line[i:i + window_size]
                q, a = self.generate_quiz(tmp_text)
                if q and a:
                    try:
                        if q == lasted_q:
                            continue
                        x = self.generate_choices(line, a, n_choices)
                        if len(x) != n_choices + 1:
                            if len(x) == n_choices:
                                # pad with a "none of the above" choice
                                x.append('ไม่มีข้อใดถูก')
                            else:
                                continue
                        if len(x) == n_choices + 1:
                            response.append(
                                dict({
                                    "question": q,
                                    "choices": x,
                                    "answer": a,
                                    "answer_idx": x.index(a)
                                }))
                            lasted_q = q
                            ith += 1
                        else:
                            continue
                    except:
                        # deliberate best-effort: any failure in choice
                        # generation just skips this window
                        pass
                if ith > n_questions:
                    break
    return response
def detect_content_languages(id_, content):
    """Detect the languages present in HTML *content*.

    Returns the langdetect result list on success, or a 'SKIP:...' marker
    string when the content is empty, extraction fails or yields nothing,
    or language detection fails.
    """
    if not content:
        return 'SKIP:EMPTY-CONTENT'
    try:
        extracted = trafilatura.extract(content)
    except Exception as exc:
        logging.error(f'failed extract for {id_}: {exc}')
        return f'SKIP:EXTRACT-ERROR: {exc}'
    if not extracted:
        return 'SKIP:EMPTY-TEXT'
    try:
        return detect_langs(extracted)
    except Exception as exc:
        logging.error(f'failed langdetect for {id_}: {exc}')
        return f'SKIP:LANGDETECT-ERROR: {exc}'
def test_html():
    """Manual check: extract a Medium article as HTML and pretty-print it."""
    import requests
    import xml.dom.minidom
    import logging
    # logging.getLogger("trafilatura").setLevel(logging.FATAL)
    # Alternative test pages:
    # response = requests.get("https://devmuaz.medium.com/flutter-clean-architecture-series-part-1-d2d4c2e75c47")
    # response = requests.get("https://www.theguardian.com/environment/2021/apr/28/speed-at-which-worlds-glaciers-are-melting-has-doubled-in-20-years")
    response = requests.get("https://dmitryelj.medium.com/howto-using-a-pager-in-the-21st-century-6a57454ecde8")
    result = trafilatura.extract(response.text,
                                 include_formatting=True,
                                 with_metadata=True,
                                 output_format="html",
                                 include_images=True,
                                 include_links=True)
    # print(result)
    document = xml.dom.minidom.parseString(result)
    print(document.toprettyxml())