def articleDateExtractorFunc(urlParam):
    old_stdout = sys.stdout  # back up current stdout
    sys.stdout = open(os.devnull, "w")
    date = articleDateExtractor.extractArticlePublishedDate(urlParam)
    sys.stdout.close()
    sys.stdout = old_stdout  # restore old stdout
    return date
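The same stdout suppression can be written with the standard-library context manager. This is a minimal alternative sketch (not part of the original snippet, function name is illustrative) that restores stdout and closes the devnull handle even if the extractor raises:

import contextlib
import os

import articleDateExtractor


def article_date_quiet(url):
    # Redirect stdout to /dev/null only for the duration of the call;
    # the context managers restore stdout and close the file automatically.
    with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
        return articleDateExtractor.extractArticlePublishedDate(url)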
def Parser(url):
    search = 'https://api.aylien.com/api/v1/extract?url=' + url
    headers = {
        "X-AYLIEN-TextAPI-Application-ID": "f94984be",
        "X-AYLIEN-TextAPI-Application-Key": "83a7b904239577d9967e5402c461f388"
    }
    req = requests.get(url=search, headers=headers)
    data = req.json()
    date = articleDateExtractor.extractArticlePublishedDate(url)
    formattedDate = date
    if date is not None:
        formattedDate = str(date).replace("-", "")
        formattedDate = formattedDate[:-9]
        formattedDate = int(formattedDate)
    parsed = {
        'title': data['title'],
        'author': data['author'],
        'article': data['article'],
        'date': formattedDate
    }
    return parsed
def run_articledateextractor(htmlstring):
    '''try with articleDateExtractor'''
    dateresult = extractArticlePublishedDate('', html=htmlstring)
    if dateresult is None:
        return None
    date = convert_date(dateresult, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
    return date
def parseURL(self, url):
    # create Newspaper3k object
    news3 = NewsArticle(url)

    # download and parse the article
    try:
        news3.download()
        news3.parse()
    except:
        self.logerror(url, "Download Failed")  # protect from failure with a try/except
        return

    # check date
    pub = news3.publish_date   # try to get the date from Newspaper3k
    access = datetime.today()  # we accessed this right now

    # second date attempt if the first failed
    if pub is None:
        pub = getdate.extractArticlePublishedDate(url)

    # we need the date, so if we can't find it we simply throw this article away
    if pub is None:
        self.logerror(url, "Date Detection Failed - Missing")
        return

    # sometimes the date extraction gets the wrong date,
    # so throw out anything that is outside of our search range
    # and assume the rest are accurate
    pub = str_to_datetime(pub)
    if (pub < str_to_datetime(self.search_after)
            or pub > str_to_datetime(self.search_before)):
        self.logerror(url, "Date Detection Failed - Out of Bounds")
        return

    # TextBlob analysis
    textBlobObj = TextBlob(news3.text, analyzer=self.analyzer)  # create TextBlob object
    # textBlobTwo = TextBlob(news3.text, analyzer=self.analyzer)

    # language check
    try:
        lang = textBlobObj.detect_language()
        if lang != 'en':
            self.logerror(url, "Non English Article")
            return  # we only want to deal with English articles
    except:
        self.logerror(url, "Language Detection Failed")
        return

    # have TextBlob calculate sentiment
    try:
        sentiment = textBlobObj.sentiment
    except Exception as e:
        print(e)
        self.logerror(url, "Sentiment Analysis Failed")
        return

    # add to article list
    this_article = Article(url, sentiment.classification, sentiment.p_pos,
                           sentiment.p_neg, news3.title, pub, access,
                           ' + '.join(news3.authors), news3.text)
    this_article.output(self.output_filename)  # output to files
    this_article.output(GABE_GLOBAL_OUTPUT)    # output again
    self.articles.append(this_article)         # add to our list
def postprocessing(self, handle=True):
    if handle:
        handl = urlhandlers.handle_by_url(self)
        self.mass_media = handl['izd_name']
        self.publish_date = handl['pubdate']
        self.text = handl['news_text']
        self.title = handl['news_title']
    self.fulltext = self.text
    if not FULL_TEXT:
        self.paragraphs = self.text.split('\n')
        self.text = self.paragraphs[0]
    if not self.publish_date:
        self.publish_date = articleDateExtractor.extractArticlePublishedDate(
            self.url, self.html)
    if not isinstance(self.publish_date, str) and self.publish_date:
        publish_date = self.publish_date
        # format as D.MM.YYYY with a zero-padded month
        self.publish_date = '{}.{:02d}.{}'.format(
            publish_date.day, publish_date.month, publish_date.year)
    if not self.mass_media:
        clean_path = urlsplit(self.url)
        base_url = clean_path.netloc
        self.mass_media = base_url.replace('www.', '')
def get_date(self, list_of_links):
    # <get_date> extracts the publishing date of the news articles
    dates = []
    for link in list_of_links:
        date = articleDateExtractor.extractArticlePublishedDate(link)
        dates.append(date)
    return dates
def get_date(url):
    try:
        article_date = articleDateExtractor.extractArticlePublishedDate(url)
        article_date = article_date.strftime('%Y-%m-%d')
    except BaseException as error:
        print('error: {}'.format(error))
        article_date = 'error: {}'.format(error)
    return article_date
def getdate(self, params):
    # parse pdate
    pdate = ""
    try:
        docrsp = doc(params['html'], params['url'])
        pubdate = articleDateExtractor.extractArticlePublishedDate(
            params['url'], docrsp.html())
        if pubdate:
            pdate = ValidateTime(int(time.mktime(pubdate.timetuple())))
    except Exception as e:
        pass
    return pdate
def parse_item(self, response):
    try:
        g = Goose()
        article = g.extract(raw_html=response.body)
        item = NeocrawlItem()
        item['totalnews'] = response.body
        item['title'] = article.title
        item['url'] = response.url
        # pass the URL first and the already-downloaded body as the html argument
        item['date'] = articleDateExtractor.extractArticlePublishedDate(
            response.url, response.body)
        item['meta'] = article.meta_description
        item['newsdesc'] = article.cleaned_text
        item['tokens'] = nltk.word_tokenize(article.cleaned_text)
        yield item
    except Exception:
        pass
def ArticleDateParser(paras):
    html = paras['html']
    url = paras['url']
    parserTable = {}
    item = {}
    try:
        item['parser'] = 'ArticleDate'
        docrsp = doc(html, url)
        pubdate = articleDateExtractor.extractArticlePublishedDate(url, docrsp.html())
        if pubdate:
            item['pdate'] = int(time.mktime(pubdate.timetuple()))
            item['pdate'] = ValidateTime(item['pdate'])
        return item
    except Exception, e:
        print e
        return item
def ArticleDateParser(paras):
    html = paras['html']
    url = paras['url']
    parserTable = {}
    item = {}
    try:
        item['parser'] = 'ArticleDate'
        docrsp = doc(html, url)
        pubdate = articleDateExtractor.extractArticlePublishedDate(
            url, docrsp.html())
        if pubdate:
            item['pdate'] = int(time.mktime(pubdate.timetuple()))
            item['pdate'] = ValidateTime(item['pdate'])
        return item
    except Exception, e:
        print e
        return item
def main():
    # read from stdin
    readIn = read_in()

    # parse into JSON
    data = json.loads(readIn)
    if "url" not in data or "html" not in data:
        print("None")
        return

    url = data["url"]
    html = data["html"]

    # TODO: Temporarily re-route stdout until d is obtained and update JS reader
    d = articleDateExtractor.extractArticlePublishedDate(url, html)
    print(d)
def parse_item(self, response):
    # parsed_uri = urlparse(response.url)
    date = articleDateExtractor.extractArticlePublishedDate(response.url)
    if date is None:
        date = find_date(response.url)
    print "date ", str(date)[0:10]
    if str(date)[0:10] == DATE:
        new_text = extract_text(response.url)
        try:
            cur.execute(
                "INSERT INTO NEWS_DATA (LINK, ARTICLE_TEXT, ARTICLE_DATE, INSERTION_DATE, ARTICLE_TYPE) values (?,?,?,?,?)",
                (response.url, new_text, str(date)[0:10],
                 datetime.now().strftime("%Y-%m-%d"), 'NON-RSS'))
            conn.commit()
            print "Record Added"
        except sqlite3.Error as e:
            print("Error Writing in the DB.. " + e.args[0])
def prosess_content(self, url):
    article = Article(url)
    article.download()
    # article.html
    article.parse()
    dbthings = db_things.DBThings()
    parser = Parser()
    if article.authors:
        self.authors = ','.join(map(str, article.authors))
    if article.keywords:
        self.keywords = ','.join(map(str, article.keywords))
    publish_date = articleDateExtractor.extractArticlePublishedDate(article.url)
    # time.sleep(5)
    parser = HtmlParser.from_url(url, Tokenizer('english'))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer('english')
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words('english')
    all_sentences = ''
    for sentence in summarizer(parser.document, 10):
        all_sentences += (sentence._text + '\n')
    # TODO: Pay for better license to speed up this process
    # time.sleep(80)
    # classifier = Classifier()
    # category = classifier.classify_news(data=all_sentences)
    category = 'General'
    if publish_date is not None:
        dbthings.insert_extracted(self.authors, str(publish_date),
                                  all_sentences.encode('utf-8', 'ignore'),
                                  article.top_image, self.keywords,
                                  article.url, article.title, category)
    return
def time_score(url):
    d = articleDateExtractor.extractArticlePublishedDate(url)
    currentDT = datetime.now()
    temp = 0.001
    print d
    print currentDT
    if d.year == currentDT.year:
        if d.month == currentDT.month:
            if d.day == currentDT.day:
                temp = 0.4
            elif abs(d.day - currentDT.day) == 1:
                temp = 0.3
            elif abs(d.day - currentDT.day) == 2:
                temp = 0.2
            elif abs(d.day - currentDT.day) == 3:
                temp = 0.1
            elif abs(d.day - currentDT.day) == 4:
                temp = 0.05
            else:
                temp = 0.01
    print "Time score is: ", temp
    return temp
def addArticle(featureClassList, prefix, articleLabel, articleURL, articleObj):
    articleTitle = articleObj.title
    authors = articleObj.authors
    articleText = articleObj.text
    articleTitle = re.sub(r'[\'\,\.\"\\\/\!\@\$\%\&\*]+', '', articleTitle)
    filename = prefix + '_' + re.sub(r'\W+', '', articleTitle)
    filepath = filename[:24] + '.txt'
    if articleLabel == credibleLabel:
        filepath = './credible/' + 'r_' + filepath
    else:
        filepath = './malicious/' + 'f_' + filepath
    saveArticleContents(filepath, articleText)
    numChar = len(articleText)
    numWords = len(articleText.split())
    articleDate = 'NULL'
    d = articleDateExtractor.extractArticlePublishedDate(articleURL)
    if type(d) == datetime.datetime:
        articleDate = d.date()
    # cast the date to str so the concatenation also works when a date object was found
    featureClassList.write(articleLabel + ',' + articleURL + ',' + '"' + filepath +
                           '"' + ',' + '"' + articleTitle + '"' + ',' +
                           str(len(authors)) + ',' + str(numChar) + ',' +
                           str(numWords) + ',' + str(articleDate) + '\n')
    return
def Archiv_Crawler_MM(Starting_Date, Ending_Date=datetime.datetime.now()):
    Link_list = []
    Date_list = []
    Year = datetime.datetime.now().timetuple()[0]

    quote_page = 'http://www.manager-magazin.de/unternehmen/archiv-' + str(Year) + '999.html'
    Right_Arrow = 'SENSELESS CONTENT'
    while Right_Arrow != None:
        ### download the HTML
        if Right_Arrow == 'SENSELESS CONTENT':
            page = urllib.request.urlopen(quote_page)
        else:
            page = urllib.request.urlopen(Right_Arrow_Link)
        soup = BeautifulSoup(page, 'html.parser')
        ### grab the article links
        name_box = soup.find_all('h2', attrs={'class': 'article-title'})
        ### take the link of every article on the current page and add it to the list
        for i in range(len(name_box)):
            if 'http' in name_box[i].find('a').get('href'):
                URL = name_box[i].find('a').get('href')
            else:
                URL = 'http://www.manager-magazin.de' + name_box[i].find('a').get('href')
            Timing = articleDateExtractor.extractArticlePublishedDate(URL)
            Time = datetime.datetime(
                Timing.timetuple()[0],
                Timing.timetuple()[1],
                Timing.timetuple()[2],
            )
            if Time > Ending_Date:
                continue
            if Time >= Starting_Date:
                Link_list.append(URL)
                Date_list.append(Time)
                ### find the "next page" button
                Right_Arrow = soup.find('a', attrs={'class': 'next'})
                ### if the button exists, keep its link
                Right_Arrow_Link = 'http://www.manager-magazin.de' + Right_Arrow.get('href')
            else:
                Right_Arrow = None
                break  # no need to go further back

    quote_page = 'http://www.manager-magazin.de/finanzen/archiv-' + str(Year) + '999.html'
    Right_Arrow = 'SENSELESS CONTENT'
    while Right_Arrow != None:
        ### download the HTML
        if Right_Arrow == 'SENSELESS CONTENT':
            page = urllib.request.urlopen(quote_page)
        else:
            page = urllib.request.urlopen(Right_Arrow_Link)
        soup = BeautifulSoup(page, 'html.parser')
        ### grab the article links
        name_box = soup.find_all('h2', attrs={'class': 'article-title'})
        ### take the link of every article on the current page and add it to the list
        for i in range(len(name_box)):
            if 'http' in name_box[i].find('a').get('href'):
                URL = name_box[i].find('a').get('href')
            else:
                URL = 'http://www.manager-magazin.de' + name_box[i].find('a').get('href')
            Timing = articleDateExtractor.extractArticlePublishedDate(URL)
            Time = datetime.datetime(
                Timing.timetuple()[0],
                Timing.timetuple()[1],
                Timing.timetuple()[2],
            )
            if Time > Ending_Date:
                continue
            if Time >= Starting_Date:
                Link_list.append(URL)
                Date_list.append(Time)
                ### find the "next page" button
                Right_Arrow = soup.find('a', attrs={'class': 'next'})
                ### if the button exists, keep its link
                Right_Arrow_Link = 'http://www.manager-magazin.de' + Right_Arrow.get('href')
            else:
                Right_Arrow = None
                break  # no need to go further back

    quote_page = 'http://www.manager-magazin.de/politik/archiv-' + str(Year) + '999.html'
    Right_Arrow = 'SENSELESS CONTENT'
    while Right_Arrow != None:
        ### download the HTML
        if Right_Arrow == 'SENSELESS CONTENT':
            page = urllib.request.urlopen(quote_page)
        else:
            page = urllib.request.urlopen(Right_Arrow_Link)
        soup = BeautifulSoup(page, 'html.parser')
        ### grab the article links
        name_box = soup.find_all('h2', attrs={'class': 'article-title'})
        ### take the link of every article on the current page and add it to the list
        for i in range(len(name_box)):
            if 'http' in name_box[i].find('a').get('href'):
                URL = name_box[i].find('a').get('href')
            else:
                URL = 'http://www.manager-magazin.de' + name_box[i].find('a').get('href')
            Timing = articleDateExtractor.extractArticlePublishedDate(URL)
            Time = datetime.datetime(
                Timing.timetuple()[0],
                Timing.timetuple()[1],
                Timing.timetuple()[2],
            )
            if Time > Ending_Date:
                continue
            if Time >= Starting_Date:
                Link_list.append(URL)
                Date_list.append(Time)
                ### find the "next page" button
                Right_Arrow = soup.find('a', attrs={'class': 'next'})
                ### if the button exists, keep its link
                Right_Arrow_Link = 'http://www.manager-magazin.de' + Right_Arrow.get('href')
            else:
                Right_Arrow = None
                break  # no need to go further back

    return Link_list, Date_list
def article(text):
    try:
        try:
            url = text[0]
            article = Article(url)
            article.download()
            slept = 0
            while article.download_state == ArticleDownloadState.NOT_STARTED:
                # raise exception if article download state does not change after 10 seconds
                if slept > 9:
                    raise ArticleException('Download never started')
                sleep(1)
                slept += 1
            article.parse()
            article.nlp()
            mariadb_connectionT = mariadb.connect(
                host='127.0.0.1', user='******', password='******',
                database='condense')
            cursor = mariadb_connectionT.cursor()
            # if article.canonical_link and article.canonical_link != url:
            #     cursor.execute("SELECT fbshares,url FROM `{!s}` where url='{!s}'".format(
            #         domain, article.canonical_link))
            #     data0 = cursor.fetchone()
            #     if data0:
            #         cursor.execute(
            #             "SELECT fbshares FROM `{!s}` where url='{!s}'".format(domain, url))
            #         data1 = cursor.fetchone()
            #         if int(data1[0] or 0) < int(data0[0] or 0):
            #             cursor.execute(
            #                 "delete FROM `{!s}` where url='{!s}'".format(domain, url))
            #             mariadb_connectionT.commit()
            #             return
            #         else:
            #             cursor.execute("delete FROM `{!s}` where url='{!s}'".format(
            #                 domain, article.canonical_link))
            #             mariadb_connectionT.commit()
            #     else:
            #         cursor.execute("update `{!s}` set url='{!s}' where url='{!s}'".format(
            #             domain, article.canonical_link, url))
            #         mariadb_connectionT.commit()
            article.nlpEntropy()
            keywords = article.keywords
            keywords = ' '.join(keywords)
            d = article.publish_date
            author = "".join(article.authors)
            if len(author) > 30 or not author:
                author = ""
            img = article.top_image
            if not d:
                d = articleDateExtractor.extractArticlePublishedDate(url, article.html)
            if not d:
                return
            cursor.execute(
                "UPDATE `{!s}` set isArticleData = '1', keywords = {!a}, image = {!a}, author={!a} , charCount='{:d}',wordCount='{:d}',stopWords='{:d}',titleCount='{:d}', imgCount = '{:d}', title={!a}, date='{:%Y-%m-%d}' where url='{!s}'"
                .format(domain, keywords, img, author, len(article.text),
                        article.totalWords, article.stopWords, len(article.title),
                        len(article.imgs), article.title, d, url))
            mariadb_connectionT.commit()
        except mariadb.Error as err:
            print("db error", err)
        except ValueError as err:
            print("Value Error", url)
            print(err)
        except TypeError as err:
            print("Type Error", url)
            print(err)
    except ArticleException:
        print("Article exception", url)
        return
    finally:
        if cursor:
            cursor.close()
            mariadb_connectionT.close()
def scrapy(request):
    if request.method == 'POST':
        # if request.method == 'POST':
        recommendations = request.POST.getlist("Sites")
        print("recommendations", recommendations)
        pd1 = [[], ]
        pd2 = []
        now = datetime.datetime.now()
        x = datetime.datetime.now()
        x1 = now.year
        today = datetime.date.today()
        mon1 = x.strftime("%b")
        mon = mon1 + str(x1)
        day1 = now.day
        d = day1
        day = str(day1) + mon1
        first = today.replace(day=1)
        lastmonth = first - datetime.timedelta(days=1)
        lastmonth = lastmonth.strftime("%b")
        lastmonths = lastmonth + str(x1)
        yesderdate = datetime.datetime.strftime(x - timedelta(1), '%d')
        yesderdates = str(yesderdate) + lastmonth
        yesda = str(yesderdate) + mon1
        ct = strftime("%I:%M %p")

        # existsm = os.path.exists(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}')
        existsm = os.path.exists(f'/home/admin-pc/Desktop/Article/Scrapy/{mon}')
        if existsm:
            pass
        else:
            # os.mkdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}')
            os.mkdir(f'/home/admin-pc/Desktop/Article/Scrapy/{mon}')
        existsm = os.path.exists(f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{day}')
        # existsm = os.path.exists(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}')
        if existsm:
            pass
        else:
            # os.mkdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}')
            os.mkdir(f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{day}')
        try:
            # for filename in os.listdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\'):
            for filename in os.listdir(f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{day}/'):
                if existsm:
                    if filename.endswith(".csv"):
                        # co = pd.read_csv(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\{filename}', delimiter=',').values.tolist()
                        co = pd.read_csv(f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{day}/{filename}', delimiter=',').values.tolist()
                        # print("Current File", co)
                        pd1 = pd1 + co
                else:
                    pass
        except:
            pass
        if day1 == "1" or day1 == 1:
            for filename in os.listdir(f'/home/admin-pc/Desktop/Article/Scrapy/{lastmonths}/{yesderdates}/'):
                # for filename in os.listdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{lastmonths}\\{yesderdates}\\'):
                if filename.endswith(".csv"):
                    # co1 = pd.read_csv(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{lastmonths}\\{yesderdates}\\{filename}', delimiter=',').values.tolist()
                    co1 = pd.read_csv(f'/home/admin-pc/Desktop/Article/Scrapy/{lastmonths}/{yesderdates}/{filename}', delimiter=',').values.tolist()
                    pd1 = pd1 + co1
        else:
            try:
                for filename in os.listdir(f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{yesda}/'):
                    # for filename in os.listdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{yesda}\\'):
                    if filename.endswith(".csv"):
                        co1 = pd.read_csv(f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{yesda}/{filename}', delimiter=',').values.tolist()
                        # co1 = pd.read_csv(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{yesda}\\{filename}', delimiter=',').values.tolist()
                        pd1 = pd1 + co1
                        # print("Old File", co1)
            except:
                pass
        if pd1 != [] or pd1 != "":
            for j in pd1:
                for k in j:
                    j1 = pd2.append(k)
        i = 0
        # list = ["https://www.engadget.com/","https://www.espn.in/","https://www.vccircle.com/","https://www.aljazeera.com/","https://www.foxnews.com/","https://edition.cnn.com/","https://www.theguardian.com/international","https://www.financialexpress.com/","https://economictimes.indiatimes.com/","https://www.economist.com/","https://www.bbc.com/","https://www.digitaltrends.com/","https://www.theverge.com/","https://www.rvcj.com/","https://techcrunch.com/","https://www.crictracker.com/cricket-news/","https://zeenews.india.com/","https://www.hindustantimes.com/","https://timesofindia.indiatimes.com/","https://www.timesnownews.com/","https://www.firstpost.com/tech","https://aninews.in","https://www.thehindu.com/","https://indiatoday.in","https://www.thequint.com/","https://inshorts.com/en/read","https://in.reuters.com/","https://indianexpress.com/","https://www.livemint.com/",]
        # list = recommendations
        for links in recommendations:
            if links != None or links != " ":
                response = requests.get(links)
                data = response.text
                soup = BeautifulSoup(data, "html.parser")  # .encode("utf-8")
                print()
                print()
                print()
                print()
                if soup != None or soup != " ":
                    if links == "https://timesofindia.indiatimes.com/":
                        article_links = soup.findAll('a', attrs={'href': re.compile("/articleshow/")})[0:10]
                        news_title = "The Times of India"
                    elif links == "https://aninews.in":
                        article_links = soup.findAll('a', attrs={'href': re.compile("/news/")})[0:10]
                        news_title = "ANI NEWS"
                    elif links == "https://indiatoday.in":
                        article_links = soup.findAll('a', attrs={'href': re.compile("/story/")})[0:10]
                        news_title = "India Today"
                    elif links == "https://www.thequint.com/entertainment":
                        article_links = soup.findAll('a', attrs={'href': re.compile("/entertainment/")})[0:10]
                        news_title = "The Quint"
                    elif links == "https://inshorts.com/en/read":
                        article_links = soup.findAll('a', attrs={'href': re.compile("/news/")})[0:10]
                        news_title = "In Shorts"
                    elif links == "https://in.reuters.com/":
                        article_links = soup.findAll('a', attrs={'href': re.compile("/article/")})[0:10]
                        news_title = "Reuters India"
                    elif links == "https://indianexpress.com/":
                        article_links = soup.findAll('a', attrs={'href': re.compile("/article/")})[0:10]
                        news_title = "Indian Express"
                    elif links == "https://www.thehindu.com/":
                        article_links = soup.findAll('a', attrs={'href': re.compile(".*//.*/.*/.*/.*")})[0:10]
                        news_title = "The Hindu"
                    elif links == "https://www.firstpost.com/tech":
                        article_links = soup.findAll('a', attrs={'href': re.compile("/tech/")})[0:10]
                        news_title = "Firstpost"
                    elif links == "https://www.timesnownews.com/":
                        article_links = soup.findAll('a', attrs={'href': re.compile("/article/")})[0:10]
                        news_title = "Times Now"
                    elif links == "https://www.pinkvilla.com/":
                        article_links = soup.findAll('a', attrs={'href': re.compile("/entertainment/")})[0:10]
                        news_title = "Pinkvilla"
                    elif links == "https://www.livemint.com/":
                        article_links = soup.findAll('section', attrs={'data-weburl': re.compile(".html")})[0:10]
                        news_title = "Live Mint"
                    elif links == "https://www.hindustantimes.com/":
                        article_links = soup.findAll('a', attrs={'href': re.compile(".html")})[0:10]
                        news_title = "Hindustan Times"
                    elif links == "https://zeenews.india.com/":
                        article_links = soup.findAll('a', attrs={'href': re.compile(".html")})[0:10]
                        news_title = "Zee News India"
                    elif links == "https://www.crictracker.com/cricket-news/":
                        article_links = soup.findAll('a', attrs={'href': re.compile(".*-.*-")})[0:10]
                        news_title = "CricTracker"
                    elif links == "https://techcrunch.com/":
                        article_links = soup.findAll('a')[0:20]
                        news_title = "TechCrunch"
                    elif links == "https://www.rvcj.com/":
                        article_links = soup.findAll('a', attrs={'href': re.compile(".*-.*-")})[0:10]
                        news_title = "RVCJ"
                    elif links == "https://www.theverge.com/":
                        article_links = list(set(soup.findAll('a', attrs={'href': re.compile(".*//.*/.*/.*/.*/.*/.*")})))[0:10]
                        news_title = "The Verge"
                    elif links == "https://www.digitaltrends.com/":
                        article_links = list(set(soup.findAll('a', attrs={'href': re.compile(".*-.*-")})))[0:10]
                        news_title = "Digital Trends"
                    elif links == "https://www.bbc.com/":
                        article_links = list(set(soup.findAll('a', attrs={'href': re.compile(".*-.*")})))[0:10]
                        news_title = "BBC"
                    elif links == "https://www.economist.com/":
                        article_links = list(set(soup.findAll('a', attrs={'href': re.compile(".*/.*/.*")})))[0:10]
                        news_title = "Economist"
                    elif links == "https://economictimes.indiatimes.com/":
                        article_links = list(set(soup.findAll('a', attrs={'href': re.compile(".*/.*/.*")})))[0:10]
                        news_title = "Economic Times"
                    elif links == "https://www.financialexpress.com/":
                        article_links = list(set(soup.findAll('a', attrs={'href': re.compile(".*/.*/.*/.*")})))[0:10]
                        news_title = "Financial Express"
                    elif links == "https://www.theguardian.com/international":
                        article_links = list(set(soup.findAll('a', attrs={'href': re.compile(".*/.*/.*/.*/.*")})))[0:10]
                        news_title = "Guardian"
                    elif links == "https://edition.cnn.com/":
                        article_links = list(set(soup.findAll('a', attrs={'href': re.compile(".*/.*/.*")})))[0:10]
                        news_title = "CNN"
                    elif links == "https://www.foxnews.com/":
                        article_links = list(set(soup.findAll('a', attrs={'href': re.compile(".*/.*/.*")})))[0:10]
                        news_title = "Fox News"
                    elif links == "https://www.aljazeera.com/":
                        article_links = list(set(soup.findAll('a', attrs={'href'})))[0:10]
                        news_title = "Al Jazeera"
                    elif links == "https://www.vccircle.com/":
                        article_links = list(set(soup.findAll('a', attrs={'href'})))[0:10]
                        news_title = "VC Circle"
                    elif links == "https://www.engadget.com/":
                        article_links = list(set(soup.findAll('a', attrs={'href': re.compile(".*/.*/.*")})))[0:10]
                        news_title = "EndGadget"
                    elif links == "https://www.espn.in/":
                        article_links = list(set(soup.findAll('a')))[0:10]
                        news_title = "ESPN"
                    # print("article_links", len(article_links))
                    for link in article_links:
                        if links == "https://www.livemint.com/":
                            link1 = link.get("data-weburl")
                        else:
                            link1 = link.get('href')
                        if link1 == None:
                            continue
                        elif len(link1) < len(links) + 5 or link1 == None or "/subscribe/" in link or "/login/" in link or "/register/" in link or "/sign-in/" in link or "/www.twitter.com" in link or "/www.facebook.com" in link or "/www.google.com" in link or "/plus.google.com" in link:
                            continue
                        elif "https://www.foxnews.com//www.foxnews.com/" in link1:
                            link1 = link1.replace("https://www.foxnews.com//www.foxnews.com/", "https://www.foxnews.com/")
                            # link1 = link.get('href')
                            # print("link1", link1)
                        elif (not "http://" in link1 and not "https://" in link1) or "https://www.hindustantimes.com/" in link1:
                            if "/" == link1[1:]:
                                link1 = link1[1:]
                                link2 = links + link1
                                url = link2
                                print("/ in ", url)
                            else:
                                if links in link1:
                                    link2 = link1
                                else:
                                    link2 = links + link1
                                # print(link2)
                                print("link2", link2)
                                url = link2.replace("//", '/').replace("http:/", 'http://').replace("https:/", 'https://')
                                print("/ not in ", url)
                        else:
                            url = link1
                        if url in pd1 or url in pd2:
                            continue
                        else:
                            try:
                                pd1.append(url)
                                article = Article(url)
                                article.download()
                            except:
                                continue
                            article.html
                            try:
                                article.parse()
                            except:
                                continue
                            today = datetime.date.today()
                            dow_time = datetime.datetime.now().time()
                            auther = article.authors
                            # print("article writer", auther)
                            title = article.title
                            title = title.replace(";", ",")
                            title = title.replace("’", " ")
                            print("title", title)
                            if title == None:
                                print("breakssssssssssssssssssss")
                                continue
                            if title.find("^Facebook$") == -1 or title.find("^reddit.com:$") == -1 or title.find("^linkedin$") == -1 or title.find("^Twitter$") == -1:
                                titles = title
                            else:
                                print("breakssssssssssssssssssss")
                                continue
                            print("article title : ", titles)
                            titles = titles.replace("‘", " ")
                            titles = titles.replace("-", " ")
                            titles = titles.replace("“", " ")
                            titles = titles.replace("”", " ")
                            text = article.text
                            texts = text.replace(";", ",")
                            texts = texts.replace("’", " ")
                            texts = texts.replace("‘", " ")
                            texts = texts.replace("-", " ")
                            texts = texts.replace("“", " ")
                            texts = texts.replace("”", " ")
                            # print("article content : ", text)
                            image_url = article.top_image
                            image_url = image_url.replace(";", ",")
                            image_url = image_url.replace("’", " ")
                            # print("article image link: ", image_url)
                            down_Date = today
                            # print("article download date :", down_Date)
                            d = articleDateExtractor.extractArticlePublishedDate(url)
                            publish_date = d
                            # print("Publish date", publish_date)
                            try:
                                publish_date1 = [publish_date.day, publish_date.month, publish_date.year]
                                publish_date1 = str(publish_date1)
                                publish_date1 = publish_date1.replace(",", "-")
                                publish_date1 = publish_date1.replace("[", " ")
                                publish_date1 = publish_date1.replace("]", " ")
                                publish_time1 = [publish_date.hour, publish_date.minute]
                                publish_time1 = str(publish_time1)
                                publish_time1 = publish_time1.replace(",", ":")
                                publish_time1 = publish_time1.replace("[", " ")
                                publish_time1 = publish_time1.replace("]", " ")
                            except:
                                publish_date1 = ""
                                publish_time1 = ""
                            filename = (f'NewsArticle{ct}')
                            filename = filename.replace(":", "+")
                            filename = filename.replace(" ", "")
                            filename = filename + '.csv'
                            # with open(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\{filename}', 'a') as csv_file:
                            with open(f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{day}/NewsArticle{ct}.csv', 'a') as csv_file:
                                writer = csv.writer(csv_file)
                                if i == 0:
                                    writer.writerow([
                                        "Headline", "Name of Site", "Article URL",
                                        "Article Text", "Image URL", "Download date",
                                        "Download Time", "News Date(DD/MM/YYYY)",
                                        "News Time(HH:MM)"
                                    ])
                                else:
                                    wri = writer.writerow([
                                        str(titles.encode("utf-8"))[2:-1],
                                        str(news_title.encode("utf-8"))[2:-1],
                                        str(url.encode("utf-8"))[2:-1],
                                        str(texts.encode("utf-8"))[2:-1],
                                        str(image_url.encode("utf-8"))[2:-1],
                                        down_Date, dow_time, publish_date1,
                                        publish_time1
                                    ])
                                i = i + 1
                else:
                    continue
            else:
                continue
        # return render(request, 'index.html', {})
    # sites_values = request.POST['Sites']
    # if request.method == 'POST':
    #     recommendations = request.POST.getlist("Sites")
    #     print("recommendations", recommendations)
    #     return render(request, 'index.html')
    return render(request, 'index.html', {})
def scrapy():
    pd1 = [[], ]
    pd2 = []
    now = datetime.datetime.now()
    x = datetime.datetime.now()
    x1 = now.year
    today = datetime.date.today()
    mon1 = x.strftime("%b")
    mon = mon1 + str(x1)
    day1 = now.day
    d = day1
    day = str(day1) + mon1
    first = today.replace(day=1)
    lastmonth = first - datetime.timedelta(days=1)
    lastmonth = lastmonth.strftime("%b")
    lastmonths = lastmonth + str(x1)
    yesderdate = datetime.datetime.strftime(x - timedelta(1), '%d')
    yesderdates = str(yesderdate) + lastmonth
    yesda = str(yesderdate) + mon1
    ct = strftime("%I:%M %p")
    existsm = os.path.exists(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}')
    if existsm:
        pass
    else:
        os.mkdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}')
    existsm = os.path.exists(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}')
    if existsm:
        pass
    else:
        os.mkdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}')
    try:
        for filename in os.listdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\'):
            if existsm:
                if filename.endswith(".csv"):
                    co = pd.read_csv(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\{filename}', delimiter=',').values.tolist()
                    # print("Current File", co)
                    pd1 = pd1 + co
            else:
                pass
    except:
        pass
    if day1 == "1" or day1 == 1:
        for filename in os.listdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{lastmonths}\\{yesderdates}\\'):
            if filename.endswith(".csv"):
                co1 = pd.read_csv(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{lastmonths}\\{yesderdates}\\{filename}', delimiter=',').values.tolist()
                pd1 = pd1 + co1
    else:
        try:
            for filename in os.listdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{yesda}\\'):
                if filename.endswith(".csv"):
                    co1 = pd.read_csv(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{yesda}\\{filename}', delimiter=',').values.tolist()
                    pd1 = pd1 + co1
                    # print("Old File", co1)
        except:
            pass
    if pd1 != [] or pd1 != "":
        for j in pd1:
            for k in j:
                j1 = pd2.append(k)
    i = 0
    list = ["https://timesofindia.indiatimes.com/", "https://aninews.in", "https://indiatoday.in"]
    list = ["https://inshorts.com/en/read", ]
    for links in list:
        if links != None or links != " ":
            response = requests.get(links)
            data = response.text
            soup = BeautifulSoup(data, "html.parser")  # .encode("utf-8")
            if soup != None or soup != " ":
                if links == "https://inshorts.com/en/read":
                    article_links = soup.findAll('a', attrs={'href': re.compile("/articleshow/")})
                    news_title = "In Shorts"
                else:
                    continue
                # if links == "https://timesofindia.indiatimes.com/":
                #     article_links = soup.findAll('a', attrs={'href': re.compile("/articleshow/")})
                #     news_title = "The Times of India"
                # elif links == "https://aninews.in":
                #     article_links = soup.findAll('a', attrs={'href': re.compile("/news/")})
                #     news_title = "ANI NEWS"
                # elif links == "https://indiatoday.in":
                #     article_links = soup.findAll('a', attrs={'href': re.compile("/story/")})
                #     news_title = "India Today"
                # print("article_links", len(article_links))
                for link in article_links:
                    link1 = link.get('href')
                    if not "http://" in link1 and not "https://" in link1:
                        if "/" == link1[1:]:
                            link1 = link1[1:]
                            link2 = links + link1
                            # print(link2)
                            url = link2
                            print("url", url)
                            continue
                        else:
                            link2 = links + link1
                            # print(link2)
                            url = link2
                            print("url", url)
                            continue
                    if url in pd1 or url in pd2:
                        continue
                    else:
                        try:
                            pd1.append(url)
                            article = Article(url)
                            article.download()
                        except:
                            continue
                        article.html
                        article.parse()
                        today = datetime.date.today()
                        dow_time = datetime.datetime.now().time()
                        auther = article.authors
                        # print("article writer", auther)
                        title = article.title
                        titles = title.replace(";", ",")
                        # print("article title : ", title)
                        text = article.text
                        texts = text.replace(";", ",")
                        # print("article content : ", text)
                        image_url = article.top_image
                        # print("article image link: ", image_url)
                        down_Date = today
                        # print("article download date :", down_Date)
                        d = articleDateExtractor.extractArticlePublishedDate(url)
                        publish_date = d
                        # print("Publish date", publish_date)
                        try:
                            publish_date1 = [publish_date.day, publish_date.month, publish_date.year]
                            publish_date1 = str(publish_date1)
                            publish_date1 = publish_date1.replace(",", "-")
                            publish_date1 = publish_date1.replace("[", " ")
                            publish_date1 = publish_date1.replace("]", " ")
                            publish_time1 = [publish_date.hour, publish_date.minute]
                            publish_time1 = str(publish_time1)
                            publish_time1 = publish_time1.replace(",", ":")
                            publish_time1 = publish_time1.replace("[", " ")
                            publish_time1 = publish_time1.replace("]", " ")
                        except:
                            publish_date1 = ""
                            publish_time1 = ""
                        filename = (f'NewsArticle{ct}')
                        filename = filename.replace(":", "+")
                        filename = filename.replace(" ", "")
                        filename = filename + '.csv'
                        with open(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\{filename}', 'a') as csv_file:
                            writer = csv.writer(csv_file)
                            if i == 0:
                                writer.writerow([
                                    "Headline", "Name of Site", "Article URL",
                                    "Article Text", "Image URL", "Download date",
                                    "Download Time", "News Date(DD/MM/YYYY)",
                                    "News Time(HH:MM)"
                                ])
                            else:
                                wri = writer.writerow([
                                    str(titles.encode("utf-8"))[2:-1],
                                    str(news_title.encode("utf-8"))[2:-1],
                                    str(url.encode("utf-8"))[2:-1],
                                    str(texts.encode("utf-8"))[2:-1],
                                    str(image_url.encode("utf-8"))[2:-1],
                                    down_Date, dow_time, publish_date1,
                                    publish_time1
                                ])
                            i = i + 1
            else:
                continue
        else:
            continue
def parse_article(self, response):
    news_id = 19684  # response.meta.get('news_id')

    # save to file
    with open(str(news_id) + '.html', 'wb') as fh:
        fh.write(response.body)

    article = Article(response.url)
    # set html manually
    with open(str(news_id) + '.html', 'rb') as fh:
        article.html = fh.read()
    os.remove(str(news_id) + '.html')

    # need to set download_state to 2 for this to work
    article.download_state = 2
    article.parse()
    article.nlp()

    date = article.publish_date
    keywords = str([x.replace("'", "''") for x in article.keywords]).replace('"', '\'')
    content = article.text.replace("'", "''")
    summary = article.summary.replace("'", "''")
    title = article.title.replace("'", "''")
    if date is None:
        date = 'null'
    else:
        date = "'" + str(date) + "'"
    authors = str([x.replace("'", "''") for x in article.authors]).replace('"', '\'')
    tags = str([x.replace("'", "''") for x in article.meta_keywords]).replace('"', '\'')
    dbconnector.execute(
        self.conn,
        'INSERT INTO "ParsedNews-newspaper"("IDNews", "Date", "Content", "Keywords", ' +
        '"Summary", "Authors", "Tags", "Title") ' +
        'VALUES (' + str(news_id) + ', ' + str(date) + ', \'' + content +
        '\', ARRAY ' + str(keywords) + '::text[], \'' + summary + '\', ARRAY ' +
        str(authors) + '::text[], ARRAY ' + str(tags) + '::text[], \'' + title + '\')')

    # get main article without comments
    content = extract_content(response.text).replace("'", "''")
    # get article and comments
    content_comments = '[\'' + extract_content_and_comments(response.text).replace("'", "''") + '\']'
    dbconnector.execute(
        self.conn,
        'INSERT INTO "ParsedNews-dragnet"("IDNews", "Content", "Comments") ' +
        'VALUES (' + str(news_id) + ', \'' + content + '\', ARRAY ' +
        str(content_comments) + '::text[])')

    date = articleDateExtractor.extractArticlePublishedDate(
        articleLink=response.url, html=response.text)
    if date is not None:
        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-ade"("IDNews", "Date") ' +
            'VALUES (' + str(news_id) + ', \'' + str(date) + '\')')

    g = Goose()
    article = g.extract(raw_html=response.text)
    date = article.publish_datetime_utc
    keywords = str([x.replace("'", "''") for x in article.tags]).replace('"', '\'')
    content = article.cleaned_text.replace("'", "''")
    summary = article.meta_description.replace("'", "''")
    title = article.title.replace("'", "''")
    if date is None:
        date = 'null'
    else:
        date = "'" + str(date) + "'"
    authors = str([x.replace("'", "''") for x in article.authors]).replace('"', '\'')
    tags = str([x.replace("'", "''") for x in article.meta_keywords.split(",")]).replace('"', '\'')
    tweets = str([x.replace("'", "''") for x in article.tweets]).replace('"', '\'')
    dbconnector.execute(
        self.conn,
        'INSERT INTO "ParsedNews-goose"(' +
        '"IDNews", "Date", "Content", "Keywords", "Summary", ' +
        '"Authors", "Tags", "Tweets",' + '"Title") VALUES (' + str(news_id) +
        ', ' + date + ', \'' + content + '\', ARRAY ' + str(keywords) +
        '::text[], \'' + str(summary) + '\', ARRAY ' + str(authors) +
        '::text[], ARRAY ' + str(tags) + '::text[], ARRAY ' + str(tweets) +
        '::text[], \'' + str(title) + '\')')
    pass
import articleDateExtractor

website = "https://edition.cnn.com/2019/07/21/europe/bulgaria-hack-tax-intl/index.html"
d = articleDateExtractor.extractArticlePublishedDate(website)
print(d)
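Several snippets above pass already-downloaded HTML as the second argument so the extractor does not fetch the page a second time. A minimal sketch of that pattern, assuming requests is installed (the URL and variable names are only illustrative):

import articleDateExtractor
import requests

url = "https://edition.cnn.com/2019/07/21/europe/bulgaria-hack-tax-intl/index.html"
html = requests.get(url).text  # reuse the response body instead of letting the extractor re-download it
d = articleDateExtractor.extractArticlePublishedDate(url, html)
print(d)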
cursor = mariadb_connection.cursor()

# retrieving information
cursor.execute("SELECT url FROM skcript")
data = cursor.fetchall()
for text in data:
    try:
        url = text[0]
        article = Article(url)
        article.download()
        article.parse()
        try:
            cursor.execute(
                "UPDATE skcript set author={!a},charCount='{:d}',title={!a} where url='{!s}'"
                .format("".join(article.authors), len(article.text), article.title, url))
        except mariadb.Error as error:
            print("Error: {}".format(error))
        d = articleDateExtractor.extractArticlePublishedDate(url)
        try:
            cursor.execute("UPDATE skcript set date='{:%Y-%m-%d}'".format(d))
        except (TypeError, mariadb.Error):
            print("date error")
    except ArticleException:
        continue
mariadb_connection.commit()
mariadb_connection.close()