def index():
    ''' Return the probability for each genre. '''
    if not (request.query.text or request.query.url):
        return 'invoke with ?text= or ?url='

    if request.query.text:
        art = article.Article(text=request.query.text)
    elif request.query.url:
        art = article.Article(url=request.query.url)

    example = [art.features[f] for f in utilities.features]

    abs_path = os.path.dirname(os.path.realpath(__file__))
    clf = joblib.load(abs_path + os.sep + 'model.pkl')
    proba = clf.predict_proba([example])[0]

    resp = {}
    for i, p in enumerate(proba):
        resp[utilities.genres[i + 1][0].split('/')[0]] = str(p)[:6]

    resp = json.dumps(resp)
    if request.query.callback:
        resp = request.query.callback + '(' + resp + ')'
    return resp
def __update_wiki(self, link_list):
    """ Gets a list of tuples of articles and links and builds the network
    from the given values. """
    for tuple_art in link_list:
        if tuple_art[0] not in self.__articles_dict:
            self.__articles_dict[tuple_art[0]] = ar.Article(tuple_art[0])
        if tuple_art[1] not in self.__articles_dict:
            self.__articles_dict[tuple_art[1]] = ar.Article(tuple_art[1])
        self.__articles_dict[tuple_art[0]].add_neighbor(
            self.__articles_dict[tuple_art[1]])
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # HEADLINE
    headline = soup.find('h1').string

    # TOPIC
    topic = ''
    if len(soup.find_all('div', class_='category')) > 0:
        # .get('span') would look up a non-existent attribute; the span's text
        # is wanted here
        topic = soup.find_all('div', class_='category')[0].find('span').get_text()

    # AUTHOR
    author = ''
    if len(soup.find_all('span', class_='author')) > 0:
        author = soup.find_all('span', class_='author')[0].get_text()

    # TEXT_BODY
    text_body = soup.find_all('article', 'fullarticle')[0].get_text()
    text_body = ' '.join(text_body.split())

    # CREATION_DATE
    creation_date = ''
    if soup.find('time'):
        creation_date = soup.find('time').get('datetime')

    return article.Article(headline, link, text_body,
                           'http://www.german-times.com', 'german-times',
                           author, topic, date.today(), creation_date)
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # HEADLINE
    headline = soup.find('h1', class_='entry-title title post_title').string

    # TOPIC
    topic = soup.find('span', class_='article_dots cat').string

    # AUTHOR
    author = soup.find('div', class_='von').contents[2][1:]

    # TEXT_BODY
    text_body = soup.find(
        'div',
        'post_content_inner_wrapper content_inner_wrapper entry-content'
    ).get_text()

    # CREATION_DATE
    creation_date = soup.find('div', class_='von').find(
        'span', class_='article_dots').string

    return article.Article(headline, link, text_body,
                           'https://www.theeuropean.de', 'theeuropean',
                           author, topic, date.today(), creation_date)
def predict(input_dir):
    ''' Get genre probabilities for each text document in input directory. '''
    clf = joblib.load('model.pkl')

    with open('results.csv', 'wb') as fh:
        writer = csv.writer(fh, delimiter='\t')
        writer.writerow(['Filename'] + [utilities.genres[g][0].split('/')[0]
                                        for g in utilities.genres])

        for filename in [f for f in os.listdir(input_dir) if f.endswith('.txt')]:
            with open(input_dir + os.sep + filename) as ifh:
                print('Processing file: ' + filename)

                row = []
                row.append(filename)

                # Read input file
                doc = ifh.read().decode('utf-8')

                # Create article object and calculate features
                art = article.Article(text=doc)
                features = [art.features[f] for f in utilities.features]

                # Get probability for each genre
                proba = clf.predict_proba([features])[0]

                # Save results
                for g in utilities.genres:
                    row.append(str(proba[g - 1])[:6])
                writer.writerow(row)
                print(row[1:])
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # HEADLINE
    headline = soup.find('h1').string

    # TOPIC
    topic = ''
    if len(soup.find_all('span', class_='item-containers')) > 0:
        # .get('a') would look up a non-existent attribute; the link text is wanted
        topic = soup.find_all('span', class_='item-containers')[0].find('a').get_text()
    topic = "" if topic is None else topic

    # AUTHOR
    author = ''
    if len(soup.find_all('span', class_='source')) > 0:
        author = soup.find_all('span', class_='source')[0].get_text()

    # TEXT_BODY
    if len(soup.find_all('div', 'art-text-inner')) > 0:
        text_body = soup.find_all('div', 'art-text-inner')[0].get_text()
        text_body = ' '.join(text_body.split())
    else:
        text_body = ''

    # CREATION_DATE
    creation_date = ''
    if soup.find('time'):
        creation_date = soup.find('time').get('datetime')

    return article.Article(headline.strip(), link, text_body,
                           'https://www.wprost.pl', 'wprost',
                           author.strip(), topic.strip(), date.today(),
                           creation_date)
def test_parse_smh_article(self):
    with open('testdata/smh_article.html', 'r') as f:
        html = f.read()
    a = article.Article(None, None, 'Sydney Morning Herald')
    a.parse(html)
    print(a.text)
def articleFromResult(item):
    try:
        articleDate = dateutil.parser.parse(
            item['pagemap']['metatags'][0]['article:published_time']).strftime(
                '%m-%d-%Y')
    except:
        # missing or malformed publication date
        articleDate = ""
    return article.Article(item['title'], item['link'], articleDate,
                           item['snippet'])
def get_next_article(self):
    if not self.buffer:
        self._add_to_buffer()
    try:
        entry = self.buffer.pop(0)
    except IndexError:
        # we have gone through the entire file
        return None
    return article.Article(id_=entry['id'], title=entry['title'],
                           text=entry['text'])
def get_next_article(self):
    self.cursor.execute('SELECT title, text FROM {0} WHERE id={1}'
                        .format(self.table, self.current_pos))
    articles = self.cursor.fetchall()
    if not articles or len(articles) != 1:
        return None
    art = article.Article(id_=self.current_pos,
                          title=articles[0][0],
                          text=articles[0][1])
    self.current_pos += 1
    return art
def test_parse_bbc_article(self):
    with open('testdata/bbc_article.html', 'r') as f:
        html = f.read()
    a = article.Article(None, None, 'BBC')
    a.parse(html)
    # Make sure that some text was found
    self.assertGreater(len(a.text), 500)
    # Make sure the ads / JS functions are removed
    self.assertNotIn('/**/', a.text)
def marketwatch(comp_n, comp_t):
    URL = 'https://www.marketwatch.com/trading-deck/stories'
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find('ol', class_='headlines block')
    if results is None:
        # the original bare `exit` was a no-op; leave the function instead
        return
    comp_name = comp_n
    comp_tick = comp_t
    article_elems = results.find_all('li')
    for article_elem in article_elems:
        if article_elem is None:
            continue
        URL = 'https://www.marketwatch.com' + article_elem.find('a')['href'].strip()
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        title_elem = article_elem.find('a', class_='bighead')
        results = soup.find(id='article-body')
        if results is None:
            continue
        para_elems = results.find_all('p')
        relevant_article = False
        # check the company name and the ticker separately;
        # `(comp_name or comp_tick) in ...` only ever tested the first truthy value
        if comp_name in title_elem.text.strip() or comp_tick in title_elem.text.strip():
            relevant_article = True
        else:
            for para_elem in para_elems:
                if para_elem is None:
                    continue
                if comp_name in para_elem.text.strip() or comp_tick in para_elem.text.strip():
                    relevant_article = True
                    break
        if relevant_article:
            body_builder = ''
            for para_elem in para_elems:
                if para_elem is None:
                    continue
                body_builder += para_elem.text.strip() + ' '
            body_builder = ' '.join(body_builder.split())
            new_article = article.Article(title_elem.text.strip(), URL,
                                          body_builder.strip())
            print(new_article, end='\n' * 2)
def handler(title, string):
    a = article.Article(title)
    a.fetch()
    if a.has_error():
        response.status = 400
        return
    count = a.get_string_count_in_text(string)
    response.headers['Content-Type'] = 'application/json'
    response.status = 200
    return json.dumps({'count': count})
def seekingalpha(comp_n, comp_t):
    URL = 'https://seekingalpha.com/market-news'
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(id='latest-news-list')
    if results is None:
        # the original bare `exit` was a no-op; leave the function instead
        return
    comp_name = comp_n
    comp_tick = comp_t
    article_elems = results.find_all('li', class_='item')
    for article_elem in article_elems:
        if article_elem is None:
            continue
        URL = 'https://seekingalpha.com' + article_elem.find('a')['href'].strip()
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        title_elem = article_elem.find('h4')
        results = soup.find(id='bullets_ul')
        if results is None:
            continue
        bullet_elems = results.find_all('p', class_='bullets_li')
        relevant_article = False
        # check the company name and the ticker separately;
        # `(comp_name or comp_tick) in ...` only ever tested the first truthy value
        if comp_name in title_elem.text.strip() or comp_tick in title_elem.text.strip():
            relevant_article = True
        else:
            for bullet_elem in bullet_elems:
                if bullet_elem is None:
                    continue
                if comp_name in bullet_elem.text.strip() or comp_tick in bullet_elem.text.strip():
                    relevant_article = True
                    break
        if relevant_article:
            body_builder = ''
            for bullet_elem in bullet_elems:
                if bullet_elem is None:
                    continue
                body_builder += bullet_elem.text.strip() + ' '
            new_article = article.Article(title_elem.text.strip(), URL,
                                          body_builder.strip())
            print(new_article, end='\n' * 2)
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # HEADLINE
    headline = soup.find('h1').string

    # TOPIC
    topic = ''
    if soup.find("a", class_="navigation-wide-list__link navigation-arrow--open"):
        menuActive = soup.find(
            "a", class_="navigation-wide-list__link navigation-arrow--open")
        topic = menuActive.find("span").get_text()

    # AUTHOR
    author = ''
    if soup.find('span', class_='byline__name'):
        author = soup.find('span', class_='byline__name').get_text()

    # TEXT_BODY
    if soup.find('div', class_='story-body__inner'):
        innerArticle = soup.find('div', class_='story-body__inner')
    elif soup.find('div', class_='vxp-media__summary'):
        innerArticle = soup.find('div', class_='vxp-media__summary')
    else:
        print("no content found on " + link)
        return
    pList = innerArticle.find_all('p')
    text_body = ''
    for p in pList:
        text_body += p.get_text() + ' '

    # CREATION_DATE
    creation_date = ''
    #soup.find('div', class_='date date--v2 relative-time').get('data-datetime')
    if soup.find('div', class_='date date--v2 relative-time'):
        timeStamp = soup.find(
            'div', class_='date date--v2 relative-time').get('data-seconds')
        # 'data-seconds' is a string attribute; convert it before building a datetime
        creation_date = datetime.fromtimestamp(int(timeStamp), timezone.utc)
        #creation_date = datetime.fromtimestamp(timeStamp).strftime("%A, %B %d, %Y %I:%M:%S")

    # CRAWL_DATE
    crawl_date = datetime.now()

    return article.Article(headline, link, text_body, 'https://www.bbc.com',
                           'bbc', author.replace("By ", ""), topic, crawl_date,
                           creation_date)
def __init__(self, parsed, args):
    """Create feed with a fixed number of articles."""
    logging.info('Started creating feed')
    articles_list = []
    cached_news_number = 0

    if args.date:
        logging.info('Started extracting data from cache')
        self.link = args.source
        self.feed_name = f'Feeds from {args.source}'
        with shelve.open('cashed_feeds') as database:
            if not database:
                raise ex.EmptyDataBase('Local feed storage is empty')
            for date in database:
                if args.date in date and database[date].feed_link == args.source:
                    articles_list.append(database[date])
                    cached_news_number += 1
            if cached_news_number == 0:
                raise ex.DateNotInDatabase(
                    'There are no feeds with this date and source in local storage'
                )
        logging.info('Finished extracting data from cache')

        if args.limit:
            if args.limit > cached_news_number and args.date:
                print(f'Only {cached_news_number} feeds cached')
                number_of_articles = cached_news_number
            elif not args.date and args.limit > len(parsed.entries):
                print(f'Only {len(parsed.entries)} feeds cached')
                number_of_articles = len(parsed.entries)
            else:
                number_of_articles = args.limit
            articles_list = articles_list[:number_of_articles]
    else:
        if args.limit:
            if args.limit > len(parsed.entries):
                print(f'Only {len(parsed.entries)} feeds available')
                number_of_articles = len(parsed.entries)
            else:
                number_of_articles = args.limit
        else:
            number_of_articles = len(parsed.entries)
        for i in range(number_of_articles):
            articles_list.append(
                article.Article(parsed.entries[i], args.source))
        self.feed_name = make_string_readable(parsed.feed.title)
        self.link = parsed.feed.link

    self.articles = articles_list
def update_network(self, link_list):
    """
    Adds any links or articles not currently in the network.

    :param link_list:
    :return:
    """
    for link in link_list:
        for title in link:
            if title not in self._articles:
                self._articles[title] = article.Article(title)
        if self._articles[link[1]] not in self._articles[link[0]]:
            self._articles[link[0]].add_neighbor(self._articles[link[1]])
def get_articles():
    print("Getting articles from: " + source_url)
    html = make_request()
    soup = BeautifulSoup(html, "html.parser")
    lis = soup.find_all("ul", class_="reactiesList")
    arts = []
    for eles in lis:
        art = article.Article()
        art.title = eles.a.text
        art.source = "voetbalzone.nl"
        art.url = eles.a.get('href')
        art.full_url = root_url + art.url
        arts.append(art)
    return arts
def generate_training(self, path):
    ''' Generate training data from a list of labeled articles. '''
    with open(path, 'rU') as fh:
        db = csv.DictReader(fh, delimiter='\t')
        with open('data/training.txt', 'wb') as fh:
            fieldnames = ['url', 'label'] + utilities.features
            writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
            writer.writeheader()

            for i, row in enumerate(db):
                # Get url
                url = None
                if row['Identifier']:
                    url = row['Identifier']
                elif (row['Prediction'] != 'None' and
                        float(row['Confidence']) > 0.675):
                    url = row['Prediction']
                else:
                    continue
                if not url.endswith(':ocr'):
                    url += ':ocr'

                # Get label
                label = None
                for g in utilities.genres:
                    if row['Genre'] in utilities.genres[g]:
                        label = g
                        break
                if not label:
                    continue

                # If valid training instance found, create new article
                try:
                    art = article.Article(url=url)
                    # Save results
                    fields = {'label': label, 'url': url}
                    for f in utilities.features:
                        fields[f] = art.features[f]
                    writer.writerow(fields)
                except (IOError, AssertionError) as e:
                    print('Error processing article ' + url + ': ' + repr(e))
def arxiv(ax_id):
    ''' Ask for arXiv identifier and return corresponding Article class. '''
    # python 3 truncates leading zeros but these might occur in arxiv identifiers
    # TODO: check!
    ax_id = str(ax_id).zfill(9)
    article_year = get_year(ax_id)

    abs_url = 'https://arxiv.org/abs/{}'.format(ax_id)
    src_abs = requests.get(abs_url)
    # obtain a _structured_ document ("tree") of the source of abs_url
    page_tree = html.fromstring(src_abs.content)

    # extract title and abstract from page tree
    title = ' '.join(
        page_tree.xpath('//meta[@name="citation_title"]/@content'))
    abstract = ' '.join(
        page_tree.xpath('//meta[@property="og:description"]/@content'))

    # get main subject from page tree
    main_subject = page_tree.xpath(
        '//span [@class="primary-subject"]')[0].text_content()

    # first get all authors (format compatible with bibtex)
    all_authors = page_tree.xpath('//meta[@name="citation_author"]/@content')
    if len(all_authors) > 1:
        authors_name = ' and '.join(all_authors)
    else:
        authors_name = all_authors[0]

    # second, create a short and 'contracted' authors' name,
    # e.g. to create a file name or bibtex key
    authors_short_list = [a.split(', ')[0] for a in all_authors[:3]]
    if len(all_authors) > 3:
        authors_short = authors_short_list[0] + ' et al'
        authors_contracted = authors_short_list[0] + 'EtAl'
    elif 1 < len(all_authors) <= 3:
        authors_short = ', '.join(authors_short_list[:-1])
        authors_short += ' and ' + authors_short_list[-1]
        authors_contracted = ''.join(authors_short_list)
    else:
        authors_short = authors_short_list[0]  # TODO: IMPROVE!?!?
        authors_contracted = authors_short

    return article.Article(title=title, authors=authors_name,
                           authors_short=authors_short,
                           authors_contracted=authors_contracted,
                           abstract=abstract, ax_id=ax_id, year=article_year,
                           main_subject=main_subject)
def get_articles():
    print("Getting articles from: " + source_url)
    html = make_request()
    soup = BeautifulSoup(html, "html.parser")
    lis = soup.find_all("div", class_="item")
    arts = []
    for eles in lis:
        print()
        art = article.Article()
        art.title = eles.a.get("title")
        art.url = eles.a.get("href")
        art.full_url = root_url + art.url
        art.source = source_url
        arts.append(art)
    return arts
def get_article_list():
    url = "http://www.chong4.com.cn"
    response = urllib.request.urlopen(url)
    line = response.read()
    soup = BeautifulSoup(line, "html.parser")
    nodes = soup.select(".textbox-title a")
    for node in nodes:
        ar = article.Article()
        ar.title = node.string
        ar.link = url + node.get("href")
        ar.id = ar.link.split("?")[1]
        # `re` here is presumably a Redis-style client instance, not the regex module
        re.hmset(ar.id, ar.__dict__)
        get_article_detail(ar.link, ar.id)
    re.save()
    return "ok"
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # CREATION_DATE
    creation_date = '' if soup.find('span', class_='stand') is None else \
        soup.find('span', class_='stand').string

    # HEADLINE
    dachzeile = '' if soup.find('span', class_='dachzeile') is None else \
        soup.find('span', class_='dachzeile').string
    title = '' if soup.find('span', class_='headline') is None else \
        soup.find('span', class_='headline').string
    headline = dachzeile + ' - ' + title

    # TOPIC
    # index 3 needs at least four path segments
    topic = '' if len(link.split("/")) < 4 else link.split("/")[3]

    # AUTHOR
    author = '' if soup.find('p', class_='autorenzeile') is None else \
        soup.find('p', class_='autorenzeile').string
    author = '' if author is None else author.replace("Von ", "")
    author = '' if author is None else author.replace(",", "")

    # TEXT_BODY
    text_body = ''
    text_body_tag = soup.find_all('p', 'text')
    for ptag in text_body_tag:
        text_body = text_body + ptag.get_text()
    if text_body != '':
        text_body = ' '.join(text_body.split())
        # remove the timestamp from the text
        text_body = text_body.replace(creation_date, "")

    # CLEAN TIME
    creation_date = creation_date.replace("Stand: ", "")
    creation_date = creation_date.replace(" Uhr", "")

    return article.Article(headline, link, text_body,
                           'https://www.tagesschau.de', 'tagesschau',
                           author, topic, date.today(), creation_date)
def __get_all_article(self):
    articles = []
    links = list(map(
        lambda x: re.sub("http://\\w+\\.10jqka.com\\.cn/", self.__m_url, x.get('href')),
        self.__soup.select(".arc-title > a")))
    i = 0
    while i < len(links):
        r = requests.get(links[i], headers=headers, allow_redirects=False)
        if r.status_code == 403:
            print("Too fast, Forbidden!")
            time.sleep(1234)
            continue
        try:
            articles.append(article.Article(self.type, r.content, r.encoding))
        except IndexError:
            pass
        finally:
            time.sleep(random.uniform(0.57, 1.08))
            i += 1
    return articles
def _init_article_(self, next_file, article_meta, body):
    id_ = self._get_ids(article_meta)
    pmc_tag = self._get_PMC_ids(article_meta)
    pmid = self._get_ids(article_meta)
    title = self._get_title(article_meta)
    try:
        temp = article_meta.find('abstract')
        if temp is None:
            abstract = []
        else:
            abstract_sections = self._get_sections(temp)
            abstract = []
            for part in abstract_sections:
                abstract.append([part[0], part[1]])
    except:
        lop = article_meta.find('abstract').findall('p')
        abstract = reduce(
            (lambda x, y: ''.join([x, ET.tostring(y).decode('utf-8')])),
            lop, "")
        if abstract == '':
            abstract = ET.tostring(article_meta.find('abstract')).decode('utf-8')

    if body is not None:
        text = self._get_sections(body)  # self._get_full_text(body)
        text.insert(0, ['Abstract', abstract])
    else:
        text = [['Abstract', abstract]]

    # store the path of this file
    art = article.Article(id_=id_, title=title, text=text)
    art.get_extra()['path'] = next_file
    art.get_extra()['PMC'] = pmc_tag
    text.insert(1, [
        "Title",
        [['Article Title', title], ['PubMed Id', pmid], ['PMC', pmc_tag]]
    ])

    # only get the abstract if the next_file is None or it doesn't exist
    if abstract is not None and next_file is not None:
        # add the abstract in
        art.get_extra()['abstract'] = abstract
    return art
def test_article_serialization(self):
    # Given
    name1 = 'mlotek'
    name2 = 'hammer'
    obj = article.Article('1', [name1, name2], 2, 2, True)
    expected = '''
    {
        "id": "1",
        "is_available": true,
        "name": ["mlotek", "hammer"],
        "quantity": 2,
        "total_quantity": 2
    }
    '''

    # When
    actual = str(obj)

    # Then
    self.assertEqual(''.join(expected.split()), ''.join(actual.split()))
def test_article_serialization2(self):
    # Given
    name1 = ''
    name2 = 'hammer'
    obj = article.Article('4', [name1, name2], 6, 5, False)
    expected = '''
    {
        "id": "4",
        "is_available": false,
        "name": ["", "hammer"],
        "quantity": 5,
        "total_quantity": 6
    }
    '''

    # When
    actual = str(obj)

    # Then
    self.assertEqual(''.join(expected.split()), ''.join(actual.split()))
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # HEADLINE
    headline = ''
    hl = soup.find('div', class_='artTitle')
    if hl is not None:
        headline = hl.get_text().strip()
    else:
        print(link)

    # TOPIC
    topic = ''
    t = soup.find("div", class_="sciezka")
    if t is not None:
        topic = t.findAll("a")[1].get_text()

    # AUTHOR
    author = ''

    # CREATION_DATE
    creation_date = ''
    d = soup.find('div', class_='artDate')
    if d is not None:
        creation_date = d.get_text().strip()

    # TEXT_BODY
    text_body = ''
    tb = soup.find('div', class_='artFull')
    if tb is None:
        print(link)
    else:
        # only extract the body if the container exists, otherwise
        # tb.find_all would raise AttributeError
        for div in tb.find_all('div'):
            div.clear()
        text_body = tb.get_text().strip()

    return article.Article(headline, link, text_body,
                           'http://www.warsawvoice.pl', 'warsawvoice',
                           author, topic, date.today(), creation_date)
def scrape(link):
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')
    [s.extract() for s in soup('script')]  # remove all script tags

    # HEADLINE
    headline = soup.find('h1').string
    #headline = soup.find('div', class_='title').string

    # TOPIC
    topic = ''
    #if len(soup.find_all('div', class_='breadcrumbs')) > 0:
    #    topic = soup.find_all('div', class_='breadcrumbs')[0].find_all('a')[1].get('title')
    #topic = soup.find("a", class_="active").get_text()

    # AUTHOR
    author = ''
    if soup.find('meta', itemprop='name'):
        author = soup.find('meta', itemprop='name').get('content')

    # TEXT_BODY
    text_body = soup.find_all('div', 'gl_plugin article')[0].get_text()
    text_body = ' '.join(text_body.split())

    # CREATION_DATE
    creation_date = ''
    #if soup.find('time'):
    #    creation_date = soup.find('time').get('datetime')
    if soup.find('meta', itemprop='datePublished'):
        creation_date = soup.find('meta', itemprop='datePublished').get('content')

    # CRAWL_DATE
    crawl_date = datetime.now()

    return article.Article(headline, link, text_body, 'https://www.se.pl',
                           'super-express', author, topic, crawl_date,
                           creation_date)
def _read_rss(self):
    """
    Fetch and parse the site's RSS page

    Returns a list of articles to check
    """
    d = feedparser.parse(self.rss_url)
    data = {
        'title': d.feed.get("title"),
        'published': d.feed.get("published_parsed"),
        'updated': d.feed.get("updated_parsed")
    }

    # Create articles from the RSS feed
    for entry in d.entries:
        article_data = {
            'published_time': entry['published_parsed'],
            'title': entry['title'],
            'summary': entry['summary'],
            'url': entry['link'],
            'host_site': self.name
        }
        a = article.Article(**article_data)
        self.articles.append(a)