Example #1
def articleDateExtractorFunc(urlParam):
    old_stdout = sys.stdout  # backup current stdout
    sys.stdout = open(os.devnull, "w")
    date = articleDateExtractor.extractArticlePublishedDate(urlParam)
    sys.stdout.close()
    sys.stdout = old_stdout  # reset old stdout
    return date
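The stdout juggling in Example #1 can also be expressed with contextlib.redirect_stdout, which restores sys.stdout even if the extractor raises. A minimal sketch (the wrapper name article_date_quiet is illustrative, not from the original source):

import contextlib
import os

import articleDateExtractor


def article_date_quiet(url):
    # Discard anything the library prints while extracting the date;
    # stdout is restored automatically when the with-block exits.
    with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
        return articleDateExtractor.extractArticlePublishedDate(url)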
Example #2
def Parser(url):

    search = 'https://api.aylien.com/api/v1/extract?url=' + url

    headers = {
        "X-AYLIEN-TextAPI-Application-ID":"f94984be",
        "X-AYLIEN-TextAPI-Application-Key":"83a7b904239577d9967e5402c461f388"
    }

    req = requests.get(url = search, headers=headers) 
    data = req.json()

    date = articleDateExtractor.extractArticlePublishedDate(url)

    formattedDate = date
    if date is not None:
        # strip the dashes and the time portion to get an integer of the form YYYYMMDD
        formattedDate = str(date).replace("-", "")
        formattedDate = formattedDate[:-9]
        formattedDate = int(formattedDate)

    parsed = {
        'title': data['title'],
        'author': data['author'],
        'article': data['article'],
        'date': formattedDate
    }

    return parsed
Example #3
def run_articledateextractor(htmlstring):
    '''try with articleDateExtractor'''
    dateresult = extractArticlePublishedDate('', html=htmlstring)
    if dateresult is None:
        return None
    date = convert_date(dateresult, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
    return date
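Example #3 calls a convert_date helper that is not shown above. A minimal sketch of what such a helper might look like, assuming the extractor's result renders with str() in the input format (no microseconds):

from datetime import datetime


def convert_date(datestring, inputformat, outputformat):
    # Parse a date given in inputformat and re-render it in outputformat,
    # e.g. '2019-07-21 10:30:00' -> '2019-07-21'.
    return datetime.strptime(str(datestring), inputformat).strftime(outputformat)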
Example #4
    def parseURL(self, url):
        # create Newspaper3k object
        news3 = NewsArticle(url)

        # download and parse the article
        try:
            news3.download()
            news3.parse()
        except Exception:
            self.logerror(url, "Download Failed") # protect from failure with a try except
            return
        
        # check date / end
        pub = news3.publish_date # try to get the date from Newspaper3k
        access = datetime.today() # record the access time

        # second date attempt if the first failed
        if pub is None:
            pub = getdate.extractArticlePublishedDate(url)
            
        # the date is required, so if it cannot be found the article is discarded
        if pub is None:
            self.logerror(url, "Date Detection Failed - Missing")
            return
        
        # the extractor sometimes returns the wrong date, so discard anything
        # outside the search range and assume the remaining dates are accurate
        pub = str_to_datetime(pub)
        if (pub < str_to_datetime(self.search_after) or pub > str_to_datetime(self.search_before)):
            self.logerror(url, "Date Detection Failed - Out of Bounds")
            return
        
        # TextBlob Analysis
        textBlobObj = TextBlob(news3.text, analyzer = self.analyzer) # create textBlob object
        #textBlobTwo = TextBlob(news3.text, analyzer = self.analyzer)
        
        # language / end
        try:
            lang = textBlobObj.detect_language()
            if (lang != 'en'):
                self.logerror(url, "Non English Article")
                return # we only want to deal with English articles
        except Exception:
            self.logerror(url, "Language Detection Failed")
            return
            
        # have TextBlob calculate sentiment
        try:
            sentiment = textBlobObj.sentiment
        except Exception as e:
            print(e)
            self.logerror(url, "Sentiment Analysis Failed")
            return
        
        # Add to article list
        this_article = Article(url, sentiment.classification, sentiment.p_pos, sentiment.p_neg, news3.title, pub, access, ' + '.join(news3.authors), news3.text)
        this_article.output(self.output_filename) #output to files
        this_article.output(GABE_GLOBAL_OUTPUT) # output again
        self.articles.append(this_article) # add to our list
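Example #4 relies on a project-specific str_to_datetime helper. A possible sketch, assuming dates arrive either as datetime objects or as parseable date strings (uses the third-party python-dateutil package):

from datetime import datetime

from dateutil import parser as dateparser  # third-party: python-dateutil


def str_to_datetime(value):
    # Pass datetime objects through unchanged; parse anything else as a date string.
    if isinstance(value, datetime):
        return value
    return dateparser.parse(str(value))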
Example #5
    def postprocessing(self, handle=True):
        if handle:
            handl = urlhandlers.handle_by_url(self)
            self.mass_media = handl['izd_name']
            self.publish_date = handl['pubdate']
            self.text = handl['news_text']
            self.title = handl['news_title']

        self.fulltext = self.text

        if not FULL_TEXT:
            self.paragraphs = self.text.split('\n')
            self.text = self.paragraphs[0]

        if not self.publish_date:
            self.publish_date = articleDateExtractor.extractArticlePublishedDate(
                self.url, self.html)

        if not isinstance(self.publish_date, str) and self.publish_date:
            publish_date = self.publish_date
            # format as D.MM.YYYY with a zero-padded month
            self.publish_date = '{}.{:02d}.{}'.format(
                publish_date.day, publish_date.month, publish_date.year)

        if not self.mass_media:
            clean_path = urlsplit(self.url)
            base_url = clean_path.netloc
            self.mass_media = base_url.replace('www.', '')
Example #6
    def get_date(self, list_of_links):
        """Extract the publishing date of each news article in list_of_links."""
        dates = []
        for link in list_of_links:
            date = articleDateExtractor.extractArticlePublishedDate(link)
            dates.append(date)
        return dates
Example #7
def get_date(url):
    try:
        article_date = articleDateExtractor.extractArticlePublishedDate(url)
        article_date = article_date.strftime('%Y-%m-%d')
    except Exception as error:
        print('error: {}'.format(error))
        article_date = 'error: {}'.format(error)
    return article_date
Example #8
    def getdate(self, params):
        # parse pdate
        pdate = ""
        try:
            docrsp = doc(params['html'], params['url'])
            pubdate = articleDateExtractor.extractArticlePublishedDate(
                params['url'], docrsp.html())
            if pubdate:
                pdate = ValidateTime(int(time.mktime(pubdate.timetuple())))
        except Exception:
            pass
        return pdate
Example #9
    def parse_item(self, response):
        try:
            g = Goose()
            article = g.extract(raw_html=response.body)
            item = NeocrawlItem()
            item['totalnews'] = response.body
            item['title'] = article.title
            item['url'] = response.url
            # pass the URL first and the already-downloaded HTML second
            item['date'] = articleDateExtractor.extractArticlePublishedDate(
                response.url, response.body)
            item['meta'] = article.meta_description
            item['newsdesc'] = article.cleaned_text
            item['tokens'] = nltk.word_tokenize(article.cleaned_text)
            yield item
        except Exception:
            pass
Example #10
def ArticleDateParser(paras):
    html = paras['html']
    url = paras['url']
    parserTable = {}
    item = {}
    try:
        item['parser'] = 'ArticleDate'
        docrsp = doc(html,url)
        pubdate = articleDateExtractor.extractArticlePublishedDate(url, docrsp.html())
        if pubdate:
            item['pdate'] = int(time.mktime(pubdate.timetuple()))
            item['pdate'] = ValidateTime( item['pdate'] )
        return item
    except Exception as e:
        print(e)
        return item
Example #11
def ArticleDateParser(paras):
    html = paras['html']
    url = paras['url']
    parserTable = {}
    item = {}
    try:
        item['parser'] = 'ArticleDate'
        docrsp = doc(html, url)
        pubdate = articleDateExtractor.extractArticlePublishedDate(
            url, docrsp.html())
        if pubdate:
            item['pdate'] = int(time.mktime(pubdate.timetuple()))
            item['pdate'] = ValidateTime(item['pdate'])
        return item
    except Exception as e:
        print(e)
        return item
Example #12
def main():
    # Read from stdin
    readIn = read_in()

    # Parse into JSON
    data = json.loads(readIn)

    if ("url" not in data or "html" not in data):
        print("None")
        return

    url = data["url"]
    html = data["html"]

    # TODO: Temporarily re-route stdout until d is obtained and update JS reader
    d = articleDateExtractor.extractArticlePublishedDate(url, html)

    print(d)
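Example #12 assumes a read_in helper (and a JavaScript caller) that are not shown. A plausible sketch of the helper, which simply collects the JSON payload piped to the script on stdin:

import sys


def read_in():
    # Return everything the parent process wrote to this script's stdin.
    return sys.stdin.read()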
Example #13
    def parse_item(self, response):

        # parsed_uri = urlparse(response.url)
        date = articleDateExtractor.extractArticlePublishedDate(response.url)
        if date is None:
            date = find_date(response.url)
        print("date", str(date)[0:10])
        if str(date)[0:10] == DATE:
            new_text = extract_text(response.url)
            try:
                cur.execute(
                    "INSERT INTO NEWS_DATA (LINK, ARTICLE_TEXT, ARTICLE_DATE, INSERTION_DATE, ARTICLE_TYPE) values (?,?,?,?,?)",
                    (response.url, new_text, str(date)[0:10],
                     datetime.now().strftime("%Y-%m-%d"), 'NON-RSS'))
                conn.commit()
                print "Record Added"
            except sqlite3.Error as e:
                print("Error Writing in the DB.. " + e.args[0])
Example #14
    def prosess_content(self, url):
        article = Article(url)

        article.download()
        # article.html
        article.parse()

        dbthings = db_things.DBThings()
        parser = Parser()

        if article.authors:
            self.authors = ','.join(map(str, article.authors))
        if article.keywords:
            self.keywords = ','.join(map(str, article.keywords))

        publish_date = articleDateExtractor.extractArticlePublishedDate(
            article.url)
        # time.sleep(5)

        parser = HtmlParser.from_url(url, Tokenizer('english'))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer('english')

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words('english')

        all_sentences = ''

        for sentence in summarizer(parser.document, 10):
            all_sentences += (sentence._text + '\n')

        # TODO: Pay for better license to speed up this process
        # time.sleep(80)
        # classifier = Classifier()
        # category = classifier.classify_news(data=all_sentences)
        category = 'General'

        if publish_date is not None:
            dbthings.insert_extracted(self.authors, str(publish_date),
                                      all_sentences.encode('utf-8', 'ignore'),
                                      article.top_image, self.keywords,
                                      article.url, article.title, category)
        return
Example #15
def time_score(url):

    d = articleDateExtractor.extractArticlePublishedDate(url)
    currentDT = datetime.now()
    temp = 0.001
    print(d)
    print(currentDT)
    if d is not None and d.year == currentDT.year:
        if d.month == currentDT.month:
            if d.day == currentDT.day:
                temp = 0.4
            elif abs(d.day - currentDT.day) == 1:
                temp = 0.3
            elif abs(d.day - currentDT.day) == 2:
                temp = 0.2
            elif abs(d.day - currentDT.day) == 3:
                temp = 0.1
            elif abs(d.day - currentDT.day) == 4:
                temp = 0.05
            else:
                temp = 0.01
    print "Time score is: ", temp
    return temp
Example #16
def addArticle(featureClassList, prefix, articleLabel, articleURL, articleObj):
    articleTitle = articleObj.title
    authors = articleObj.authors
    articleText = articleObj.text
    
    articleTitle = re.sub(r'[\'\,\.\"\\\/\!\@\$\%\&\*]+', '', articleTitle)
    filename = prefix + '_' + re.sub(r'\W+', '', articleTitle)
    filepath = filename[:24] + '.txt'
    if (articleLabel == credibleLabel):
        filepath = './credible/' + 'r_' + filepath
    else:
        filepath = './malicious/' + 'f_' + filepath
    
    saveArticleContents(filepath, articleText)
    
    numChar = len(articleText)
    numWords = len(articleText.split())
    articleDate = 'NULL'
    d = articleDateExtractor.extractArticlePublishedDate(articleURL)
    if isinstance(d, datetime.datetime):
        articleDate = str(d.date())  # keep it a string so it can be concatenated into the CSV line below
    
    featureClassList.write(articleLabel + ',' + articleURL + ',' + '"' + filepath + '"' + ',' + '"' + articleTitle + '"' + ',' + str(len(authors)) + ',' + str(numChar) + ',' + str(numWords) + ',' + articleDate + '\n')
    return
Example #17
def Archiv_Crawler_MM(Starting_Date, Ending_Date=datetime.datetime.now()):

    Link_list = []
    Date_list = []

    Year = datetime.datetime.now().timetuple()[0]

    quote_page = 'http://www.manager-magazin.de/unternehmen/archiv-' + str(
        Year) + '999.html'

    Right_Arrow = 'SENSELESS CONTENT'

    while Right_Arrow != None:
        ### Download the HTML
        if Right_Arrow == 'SENSELESS CONTENT':
            page = urllib.request.urlopen(quote_page)
        else:
            page = urllib.request.urlopen(Right_Arrow_Link)

        soup = BeautifulSoup(page, 'html.parser')
        ### Collect the article headings
        name_box = soup.find_all('h2', attrs={'class': 'article-title'})
        ### Extract the link from every article on the current page and add it to the list
        for i in range(len(name_box)):
            if 'http' in name_box[i].find('a').get('href'):
                URL = name_box[i].find('a').get('href')
            else:
                URL = 'http://www.manager-magazin.de' + name_box[i].find(
                    'a').get('href')
            Timing = articleDateExtractor.extractArticlePublishedDate(URL)
            if Timing is None:
                continue  # skip articles whose publication date cannot be extracted
            Time = datetime.datetime(
                Timing.timetuple()[0],
                Timing.timetuple()[1],
                Timing.timetuple()[2],
            )
            if Time > Ending_Date:
                continue
            if Time >= Starting_Date:
                Link_list.append(URL)
                Date_list.append(Time)
                ### Find the next-page button
                Right_Arrow = soup.find('a', attrs={'class': 'next'})
                ### If the button exists, store its link
                Right_Arrow_Link = 'http://www.manager-magazin.de' + Right_Arrow.get(
                    'href')
            else:
                Right_Arrow = None
                break  # no need to go further back

    quote_page = 'http://www.manager-magazin.de/finanzen/archiv-' + str(
        Year) + '999.html'

    Right_Arrow = 'SENSELESS CONTENT'

    while Right_Arrow != None:
        ### Download the HTML
        if Right_Arrow == 'SENSELESS CONTENT':
            page = urllib.request.urlopen(quote_page)
        else:
            page = urllib.request.urlopen(Right_Arrow_Link)

        soup = BeautifulSoup(page, 'html.parser')
        ### Collect the article headings
        name_box = soup.find_all('h2', attrs={'class': 'article-title'})
        ### Extract the link from every article on the current page and add it to the list
        for i in range(len(name_box)):
            if 'http' in name_box[i].find('a').get('href'):
                URL = name_box[i].find('a').get('href')
            else:
                URL = 'http://www.manager-magazin.de' + name_box[i].find(
                    'a').get('href')
            Timing = articleDateExtractor.extractArticlePublishedDate(URL)
            if Timing is None:
                continue  # skip articles whose publication date cannot be extracted
            Time = datetime.datetime(
                Timing.timetuple()[0],
                Timing.timetuple()[1],
                Timing.timetuple()[2],
            )
            if Time > Ending_Date:
                continue
            if Time >= Starting_Date:
                Link_list.append(URL)
                Date_list.append(Time)
                ### Find the next-page button
                Right_Arrow = soup.find('a', attrs={'class': 'next'})
                ### If the button exists, store its link
                Right_Arrow_Link = 'http://www.manager-magazin.de' + Right_Arrow.get(
                    'href')
            else:
                Right_Arrow = None
                break  # no need to go further back

    quote_page = 'http://www.manager-magazin.de/politik/archiv-' + str(
        Year) + '999.html'

    Right_Arrow = 'SENSELESS CONTENT'

    while Right_Arrow != None:
        ### Download the HTML
        if Right_Arrow == 'SENSELESS CONTENT':
            page = urllib.request.urlopen(quote_page)
        else:
            page = urllib.request.urlopen(Right_Arrow_Link)

        soup = BeautifulSoup(page, 'html.parser')
        ### Collect the article headings
        name_box = soup.find_all('h2', attrs={'class': 'article-title'})
        ### Extract the link from every article on the current page and add it to the list
        for i in range(len(name_box)):
            if 'http' in name_box[i].find('a').get('href'):
                URL = name_box[i].find('a').get('href')
            else:
                URL = 'http://www.manager-magazin.de' + name_box[i].find(
                    'a').get('href')
            Timing = articleDateExtractor.extractArticlePublishedDate(URL)
            if Timing is None:
                continue  # skip articles whose publication date cannot be extracted
            Time = datetime.datetime(
                Timing.timetuple()[0],
                Timing.timetuple()[1],
                Timing.timetuple()[2],
            )
            if Time > Ending_Date:
                continue
            if Time >= Starting_Date:
                Link_list.append(URL)
                Date_list.append(Time)
                ### Find the next-page button
                Right_Arrow = soup.find('a', attrs={'class': 'next'})
                ### If the button exists, store its link
                Right_Arrow_Link = 'http://www.manager-magazin.de' + Right_Arrow.get(
                    'href')
            else:
                Right_Arrow = None
                break  # no need to go further back

    return Link_list, Date_list
Example #18
def article(text):
    # initialise these so the finally block is safe even if connecting fails
    cursor = None
    mariadb_connectionT = None
    try:
        try:
            url = text[0]
            article = Article(url)
            article.download()
            slept = 0
            while article.download_state == ArticleDownloadState.NOT_STARTED:
                # Raise exception if article download state does not change after 10 seconds
                if slept > 9:
                    raise ArticleException('Download never started')
                sleep(1)
                slept += 1
            article.parse()
            article.nlp()
            mariadb_connectionT = mariadb.connect(
                host='127.0.0.1',
                user='******',
                password='******',
                database='condense')
            cursor = mariadb_connectionT.cursor()
            # if article.canonical_link and article.canonical_link != url:
            #     cursor.execute("SELECT fbshares,url FROM `{!s}` where url='{!s}'".format(
            #         domain, article.canonical_link))
            #     data0 = cursor.fetchone()
            #     if data0:
            #         cursor.execute(
            #             "SELECT fbshares  FROM `{!s}` where url='{!s}'".format(domain, url))
            #         data1 = cursor.fetchone()
            #         if int(data1[0] or 0) < int(data0[0] or 0):
            #             cursor.execute(
            #                 "delete FROM `{!s}` where url='{!s}'".format(domain, url))
            #             mariadb_connectionT.commit()
            #             return
            #         else:
            #             cursor.execute("delete FROM `{!s}` where url='{!s}'".format(
            #                 domain, article.canonical_link))
            #             mariadb_connectionT.commit()
            #     else:
            #         cursor.execute("update `{!s}` set url='{!s}' where url='{!s}'".format(
            #             domain, article.canonical_link, url))
            #         mariadb_connectionT.commit()
            article.nlpEntropy()
            keywords = article.keywords
            keywords = ' '.join(keywords)
            d = article.publish_date
            author = "".join(article.authors)
            if len(author) > 30 or not author:
                author = ""
            img = article.top_image
            if not d:
                d = articleDateExtractor.extractArticlePublishedDate(
                    url, article.html)
            if not d:
                return
            cursor.execute(
                "UPDATE `{!s}` set isArticleData = '1', keywords = {!a}, image = {!a}, author={!a} , charCount='{:d}',wordCount='{:d}',stopWords='{:d}',titleCount='{:d}', imgCount = '{:d}', title={!a}, date='{:%Y-%m-%d}' where url='{!s}'"
                .format(domain, keywords, img, author, len(article.text),
                        article.totalWords, article.stopWords,
                        len(article.title), len(article.imgs), article.title,
                        d, url))
            mariadb_connectionT.commit()
        except mariadb.Error as err:
            print("db error", err)
        except ValueError as err:
            print("Value Error", url)
            print(err)
        except TypeError as err:
            print("Type Error", url)
            print(err)
        except ArticleException:
            print("Article exception", url)
            return
    finally:
        if cursor:
            cursor.close()
        if mariadb_connectionT:
            mariadb_connectionT.close()
Example #19
def scrapy(request):
    if request.method == 'POST':
        recommendations = request.POST.getlist("Sites")
        print("recommendations", recommendations)

        pd1 = [
            [],
        ]
        pd2 = []
        now = datetime.datetime.now()
        x = datetime.datetime.now()
        x1 = now.year
        today = datetime.date.today()
        mon1 = x.strftime("%b")
        mon = mon1 + str(x1)
        day1 = now.day
        d = day1
        day = str(day1) + mon1
        first = today.replace(day=1)
        lastmonth = first - datetime.timedelta(days=1)
        lastmonth = lastmonth.strftime("%b")
        lastmonths = lastmonth + str(x1)
        yesderdate = datetime.datetime.strftime(x - timedelta(1), '%d')
        yesderdates = str(yesderdate) + lastmonth
        yesda = str(yesderdate) + mon1
        ct = strftime("%I:%M %p")
        #existsm = os.path.exists(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}')
        existsm = os.path.exists(
            f'/home/admin-pc/Desktop/Article/Scrapy/{mon}')
        if existsm:
            pass
        else:
            #os.mkdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}')
            os.mkdir(f'/home/admin-pc/Desktop/Article/Scrapy/{mon}')

        existsm = os.path.exists(
            f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{day}')
        #existsm = os.path.exists(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}')
        if existsm:
            pass
        else:
            #os.mkdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}')
            os.mkdir(f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{day}')

        try:
            #for filename in os.listdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\'):
            for filename in os.listdir(
                    f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{day}/'):
                if existsm:
                    if filename.endswith(".csv"):
                        #co=pd.read_csv(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\{filename}', delimiter = ',').values.tolist()
                        co = pd.read_csv(
                            f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{day}/{filename}',
                            delimiter=',').values.tolist()
                        #print("Current File",co)
                        pd1 = pd1 + co
                else:
                    pass

        except:
            pass
        if day1 == "1" or day1 == 1:
            for filename in os.listdir(
                    f'/home/admin-pc/Desktop/Article/Scrapy/{lastmonths}/{yesderdates}/'
            ):
                #for filename in os.listdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{lastmonths}\\{yesderdates}\\'):

                if filename.endswith(".csv"):
                    #co1=pd.read_csv(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{lastmonths}\\{yesderdates}\\{filename}', delimiter = ',').values.tolist()
                    co1 = pd.read_csv(
                        f'/home/admin-pc/Desktop/Article/Scrapy/{lastmonths}/{yesderdates}/{filename}',
                        delimiter=',').values.tolist()
                    pd1 = pd1 + co1
        else:
            try:
                for filename in os.listdir(
                        f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{yesda}/'
                ):
                    #for filename in os.listdir(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{yesda}\\'):
                    if filename.endswith(".csv"):
                        co1 = pd.read_csv(
                            f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{yesda}/{filename}',
                            delimiter=',').values.tolist()
                        #co1=pd.read_csv(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{yesda}\\{filename}', delimiter = ',').values.tolist()
                        pd1 = pd1 + co1
                        #print("Old File",co1)
            except:
                pass
        if pd1:
            for j in pd1:
                for k in j:
                    j1 = pd2.append(k)
        i = 0
        #list = ["https://www.engadget.com/","https://www.espn.in/","https://www.vccircle.com/","https://www.aljazeera.com/","https://www.foxnews.com/","https://edition.cnn.com/","https://www.theguardian.com/international","https://www.financialexpress.com/","https://economictimes.indiatimes.com/", "https://www.economist.com/","https://www.bbc.com/", "https://www.digitaltrends.com/","https://www.theverge.com/", "https://www.rvcj.com/" ,"https://techcrunch.com/","https://www.crictracker.com/cricket-news/","https://zeenews.india.com/","https://www.hindustantimes.com/","https://timesofindia.indiatimes.com/","https://www.timesnownews.com/","https://www.firstpost.com/tech","https://aninews.in","https://www.thehindu.com/","https://indiatoday.in","https://www.thequint.com/","https://inshorts.com/en/read","https://in.reuters.com/","https://indianexpress.com/","https://www.livemint.com/",]
        # list = recommendations
        for links in recommendations:
            if links is not None and links != " ":
                response = requests.get(links)
                data = response.text
                soup = BeautifulSoup(data, "html.parser")  #.encode("utf-8")
                print()
                print()
                print()
                print()

                if soup is not None:
                    if links == "https://timesofindia.indiatimes.com/":
                        article_links = soup.findAll(
                            'a', attrs={'href':
                                        re.compile("/articleshow/")})[0:10]
                        news_title = "The Times of India"
                    elif links == "https://aninews.in":
                        article_links = soup.findAll(
                            'a', attrs={'href': re.compile("/news/")})[0:10]
                        news_title = "ANI NEWS"
                    elif links == "https://indiatoday.in":
                        article_links = soup.findAll(
                            'a', attrs={'href': re.compile("/story/")})[0:10]
                        news_title = "India Today"
                    elif links == "https://www.thequint.com/entertainment":
                        article_links = soup.findAll(
                            'a', attrs={'href':
                                        re.compile("/entertainment/")})[0:10]
                        news_title = "The Quint"
                    elif links == "https://inshorts.com/en/read":
                        article_links = soup.findAll(
                            'a', attrs={'href': re.compile("/news/")})[0:10]
                        news_title = "In Shorts"
                    elif links == "https://in.reuters.com/":
                        article_links = soup.findAll(
                            'a', attrs={'href': re.compile("/article/")})[0:10]
                        news_title = "Reuters India"

                    elif links == "https://indianexpress.com/":
                        article_links = soup.findAll(
                            'a', attrs={'href': re.compile("/article/")})[0:10]
                        news_title = "Indian Express"

                    elif links == "https://www.thehindu.com/":
                        article_links = soup.findAll(
                            'a', attrs={'href':
                                        re.compile(".*//.*/.*/.*/.*")})[0:10]
                        news_title = "The Hindu"

                    elif links == "https://www.firstpost.com/tech":
                        article_links = soup.findAll(
                            'a', attrs={'href': re.compile("/tech/")})[0:10]
                        news_title = "Firstpost"
                    elif links == "https://www.timesnownews.com/":
                        article_links = soup.findAll(
                            'a', attrs={'href': re.compile("/article/")})[0:10]
                        news_title = "Times Now"
                    elif links == "https://www.pinkvilla.com/":
                        article_links = soup.findAll(
                            'a', attrs={'href':
                                        re.compile("/entertainment/")})[0:10]
                        news_title = "Pinkvilla"
                    elif links == "https://www.livemint.com/":
                        article_links = soup.findAll(
                            'section',
                            attrs={'data-weburl': re.compile(".html")})[0:10]
                        news_title = "Live Mint"
                    elif links == "https://www.hindustantimes.com/":
                        article_links = soup.findAll(
                            'a', attrs={'href': re.compile(".html")})[0:10]
                        news_title = "Hindustan Times"
                    elif links == "https://zeenews.india.com/":
                        article_links = soup.findAll(
                            'a', attrs={'href': re.compile(".html")})[0:10]
                        news_title = "Zee News India"
                    elif links == "https://www.crictracker.com/cricket-news/":
                        article_links = soup.findAll(
                            'a', attrs={'href': re.compile(".*-.*-")})[0:10]
                        news_title = "CricTracker"
                    elif links == "https://techcrunch.com/":
                        article_links = soup.findAll('a')[0:20]
                        news_title = "TechCrunch"
                    elif links == "https://www.rvcj.com/":
                        article_links = soup.findAll(
                            'a', attrs={'href': re.compile(".*-.*-")})[0:10]
                        news_title = "RVCJ"
                    elif links == "https://www.theverge.com/":
                        article_links = list(
                            set(
                                soup.findAll(
                                    'a',
                                    attrs={
                                        'href':
                                        re.compile(".*//.*/.*/.*/.*/.*/.*")
                                    })))[0:10]
                        news_title = "The Verge"
                    elif links == "https://www.digitaltrends.com/":
                        article_links = list(
                            set(
                                soup.findAll(
                                    'a', attrs={'href':
                                                re.compile(".*-.*-")})))[0:10]
                        news_title = "Digital Trends"
                    elif links == "https://www.bbc.com/":
                        article_links = list(
                            set(
                                soup.findAll(
                                    'a', attrs={'href':
                                                re.compile(".*-.*")})))[0:10]
                        news_title = "BBC"
                    elif links == "https://www.economist.com/":
                        article_links = list(
                            set(
                                soup.findAll(
                                    'a',
                                    attrs={'href':
                                           re.compile(".*/.*/.*")})))[0:10]
                        news_title = "Economist"
                    elif links == "https://economictimes.indiatimes.com/":
                        article_links = list(
                            set(
                                soup.findAll(
                                    'a',
                                    attrs={'href':
                                           re.compile(".*/.*/.*")})))[0:10]
                        news_title = "Economic Times"
                    elif links == "https://www.financialexpress.com/":
                        article_links = list(
                            set(
                                soup.findAll(
                                    'a',
                                    attrs={'href':
                                           re.compile(".*/.*/.*/.*")})))[0:10]
                        news_title = "Financial Express"
                    elif links == "https://www.theguardian.com/international":
                        article_links = list(
                            set(
                                soup.findAll('a',
                                             attrs={
                                                 'href':
                                                 re.compile(".*/.*/.*/.*/.*")
                                             })))[0:10]
                        news_title = "Guardian"
                    elif links == "https://edition.cnn.com/":
                        article_links = list(
                            set(
                                soup.findAll(
                                    'a',
                                    attrs={'href':
                                           re.compile(".*/.*/.*")})))[0:10]
                        news_title = "CNN"
                    elif links == "https://www.foxnews.com/":
                        article_links = list(
                            set(
                                soup.findAll(
                                    'a',
                                    attrs={'href':
                                           re.compile(".*/.*/.*")})))[0:10]
                        news_title = "Fox News"
                    elif links == "https://www.aljazeera.com/":
                        article_links = list(
                            set(soup.findAll('a', attrs={'href'})))[0:10]
                        news_title = "Al Jazeera"
                    elif links == "https://www.vccircle.com/":
                        article_links = list(
                            set(soup.findAll('a', attrs={'href'})))[0:10]
                        news_title = "VC Circle"
                    elif links == "https://www.engadget.com/":
                        article_links = list(
                            set(
                                soup.findAll(
                                    'a',
                                    attrs={'href':
                                           re.compile(".*/.*/.*")})))[0:10]
                        news_title = "EndGadget"
                    elif links == "https://www.espn.in/":
                        article_links = list(set(soup.findAll('a')))[0:10]
                        news_title = "ESPN"

                    #print("article_links",len(article_links))
                    for link in article_links:
                        if links == "https://www.livemint.com/":
                            link1 = link.get("data-weburl")
                        else:
                            link1 = link.get('href')
                        if link1 is None:
                            continue
                        elif (len(link1) < len(links) + 5
                              or "/subscribe/" in link1 or "/login/" in link1
                              or "/register/" in link1 or "/sign-in/" in link1
                              or "/www.twitter.com" in link1
                              or "/www.facebook.com" in link1
                              or "/www.google.com" in link1
                              or "/plus.google.com" in link1):
                            continue
                        elif "https://www.foxnews.com//www.foxnews.com/" in link1:
                            link1 = link1.replace(
                                "https://www.foxnews.com//www.foxnews.com/",
                                "https://www.foxnews.com/")
                        #link1=link.get('href')
                        #print("link1",link1)
                        elif (not "http://" in link1
                              and not "https://" in link1
                              ) or "https://www.hindustantimes.com/" in link1:
                            if "/" == link1[1:]:
                                link1 = link1[1:]
                                link2 = links + link1
                                url = link2
                                print("/ in ", url)
                            else:
                                if links in link1:
                                    link2 = link1
                                else:
                                    link2 = links + link1
                                #print(link2)
                                print("link2", link2)
                                url = link2.replace("//", '/').replace(
                                    "http:/",
                                    'http://').replace("https:/", 'https://')
                                print("/ not in ", url)

                        else:
                            url = link1
                        if url in pd1 or url in pd2:
                            continue
                        else:
                            try:
                                pd1.append(url)
                                article = Article(url)
                                article.download()
                            except:
                                continue
                            article.html
                            try:
                                article.parse()
                            except:
                                continue

                            today = datetime.date.today()
                            dow_time = datetime.datetime.now().time()
                            auther = article.authors
                            #print("article writer",auther)
                            title = article.title
                            title = title.replace(";", ",")
                            title = title.replace("’", " ")
                            print("title", title)
                            if title is None:
                                print("breakssssssssssssssssssss")
                                continue

                            if title.find("^Facebook$") == -1 or title.find(
                                    "^reddit.com:$") == -1 or title.find(
                                        "^linkedin$") == -1 or title.find(
                                            "^Twitter$") == -1:
                                titles = title
                            else:
                                print("breakssssssssssssssssssss")
                                continue

                            print("article title : ", titles)
                            titles = titles.replace("‘", " ")
                            titles = titles.replace("-", " ")
                            titles = titles.replace("“", " ")
                            titles = titles.replace("”", " ")
                            text = article.text
                            texts = text.replace(";", ",")
                            texts = texts.replace("’", " ")
                            texts = texts.replace("‘", " ")
                            texts = texts.replace("-", " ")
                            texts = texts.replace("“", " ")
                            texts = texts.replace("”", " ")
                            #print("article content : ",text)
                            image_url = article.top_image
                            image_url = image_url.replace(";", ",")
                            image_url = image_url.replace("’", " ")
                            #print("article image link: ",image_url)
                            down_Date = today
                            #print("article download date :", down_Date)
                            d = articleDateExtractor.extractArticlePublishedDate(
                                url)
                            publish_date = d
                            #print("Publish date",publish_date)
                            try:
                                publish_date1 = [
                                    publish_date.day, publish_date.month,
                                    publish_date.year
                                ]
                                publish_date1 = str(publish_date1)
                                publish_date1 = publish_date1.replace(",", "-")
                                publish_date1 = publish_date1.replace("[", " ")
                                publish_date1 = publish_date1.replace("]", " ")
                                publish_time1 = [
                                    publish_date.hour, publish_date.minute
                                ]
                                publish_time1 = str(publish_time1)
                                publish_time1 = publish_time1.replace(",", ":")
                                publish_time1 = publish_time1.replace("[", " ")
                                publish_time1 = publish_time1.replace("]", " ")
                            except:
                                publish_date1 = ""
                                publish_time1 = ""
                            filename = (f'NewsArticle{ct}')
                            filename = filename.replace(":", "+")
                            filename = filename.replace(" ", "")
                            filename = filename + '.csv'

                            #with open(f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\{filename}', 'a') as csv_file:
                            with open(
                                    f'/home/admin-pc/Desktop/Article/Scrapy/{mon}/{day}/NewsArticle{ct}.csv',
                                    'a') as csv_file:
                                writer = csv.writer(csv_file)
                                if i == 0:
                                    writer.writerow([
                                        "Headline", "Name of Site",
                                        "Article URL", "Article Text",
                                        "Image URL", "Download date",
                                        "Download Time",
                                        "News Date(DD/MM/YYYY)",
                                        "News Time(HH:MM)"
                                    ])
                                else:
                                    wri = writer.writerow([
                                        str(titles.encode("utf-8"))[2:-1],
                                        str(news_title.encode("utf-8"))[2:-1],
                                        str(url.encode("utf-8"))[2:-1],
                                        str(texts.encode("utf-8"))[2:-1],
                                        str(image_url.encode("utf-8"))[2:-1],
                                        down_Date, dow_time, publish_date1,
                                        publish_time1
                                    ])
                                i = i + 1
                else:
                    continue

            else:
                continue

    #return render(request, 'index.html',{})

    #sites_values=request.POST['Sites']
# if request.method == 'POST':
#     recommendations=request.POST.getlist("Sites")
#     print("recommendations",recommendations)
# return render(request, 'index.html')
    return render(request, 'index.html', {})
Example #20
def scrapy():
    pd1 = [
        [],
    ]
    pd2 = []
    now = datetime.datetime.now()
    x = datetime.datetime.now()
    x1 = now.year
    today = datetime.date.today()
    mon1 = x.strftime("%b")
    mon = mon1 + str(x1)
    day1 = now.day
    d = day1
    day = str(day1) + mon1
    first = today.replace(day=1)
    lastmonth = first - datetime.timedelta(days=1)
    lastmonth = lastmonth.strftime("%b")
    lastmonths = lastmonth + str(x1)
    yesderdate = datetime.datetime.strftime(x - timedelta(1), '%d')
    yesderdates = str(yesderdate) + lastmonth
    yesda = str(yesderdate) + mon1
    ct = strftime("%I:%M %p")
    existsm = os.path.exists(
        f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}')
    if existsm:
        pass
    else:
        os.mkdir(
            f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}')

    existsm = os.path.exists(
        f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}')
    if existsm:
        pass
    else:
        os.mkdir(
            f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}'
        )

    try:
        for filename in os.listdir(
                f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\'
        ):
            if existsm:
                if filename.endswith(".csv"):
                    co = pd.read_csv(
                        f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\{filename}',
                        delimiter=',').values.tolist()
                    #print("Current File",co)
                    pd1 = pd1 + co
            else:
                pass

    except:
        pass
    if day1 == "1" or day1 == 1:

        for filename in os.listdir(
                f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{lastmonths}\\{yesderdates}\\'
        ):

            if filename.endswith(".csv"):
                co1 = pd.read_csv(
                    f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{lastmonths}\\{yesderdates}\\{filename}',
                    delimiter=',').values.tolist()
                pd1 = pd1 + co1
    else:
        try:
            for filename in os.listdir(
                    f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{yesda}\\'
            ):
                if filename.endswith(".csv"):
                    co1 = pd.read_csv(
                        f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{yesda}\\{filename}',
                        delimiter=',').values.tolist()
                    pd1 = pd1 + co1
                    #print("Old File",co1)
        except:
            pass
    if pd1:
        for j in pd1:
            for k in j:
                j1 = pd2.append(k)
    i = 0
    sites = [
        "https://timesofindia.indiatimes.com/", "https://aninews.in",
        "https://indiatoday.in"
    ]
    # the list above is immediately overridden; only In Shorts is crawled here
    sites = [
        "https://inshorts.com/en/read",
    ]
    for links in sites:
        if links is not None and links != " ":
            response = requests.get(links)
            data = response.text
            soup = BeautifulSoup(data, "html.parser")  #.encode("utf-8")
            if soup is not None:
                if links == "https://inshorts.com/en/read":
                    article_links = soup.findAll(
                        'a', attrs={'href': re.compile("/articleshow/")})
                    news_title = "In Shorts"
                else:
                    continue
                # if links == "https://timesofindia.indiatimes.com/":
                # 	article_links = soup.findAll('a', attrs={'href': re.compile("/articleshow/")})
                # 	news_title="The Times of India"
                # elif links == "https://aninews.in":
                # 	article_links = soup.findAll('a', attrs={'href': re.compile("/news/")})
                # 	news_title = "ANI NEWS"
                # elif links == "https://indiatoday.in" :
                # 	article_links = soup.findAll('a', attrs={'href': re.compile("/story/")})
                # 	news_title="India Today"
                #print("article_links",len(article_links))
                for link in article_links:
                    link1 = link.get('href')
                    if not "http://" in link1 and not "https://" in link1:
                        if "/" == link1[1:]:
                            link1 = link1[1:]
                            link2 = links + link1
                            #print(link2)
                            url = link2
                            print("url", url)
                            continue
                        else:
                            link2 = links + link1
                            #print(link2)
                            url = link2
                            print("url", url)
                            if url in pd1 or url in pd2:
                                continue
                            else:
                                try:
                                    pd1.append(url)
                                    article = Article(url)
                                    article.download()
                                except:
                                    continue
                                article.html
                                article.parse()
                                today = datetime.date.today()
                                dow_time = datetime.datetime.now().time()
                                auther = article.authors
                                #print("article writer",auther)
                                title = article.title
                                titles = title.replace(";", ",")
                                #print("article title : ",title)
                                text = article.text
                                texts = text.replace(";", ",")
                                #print("article content : ",text)
                                image_url = article.top_image
                                #print("article image link: ",image_url)
                                down_Date = today
                                #print("article download date :", down_Date)
                                d = articleDateExtractor.extractArticlePublishedDate(
                                    url)
                                publish_date = d
                                #print("Publish date",publish_date)
                                try:
                                    publish_date1 = [
                                        publish_date.day, publish_date.month,
                                        publish_date.year
                                    ]
                                    publish_date1 = str(publish_date1)
                                    publish_date1 = publish_date1.replace(
                                        ",", "-")
                                    publish_date1 = publish_date1.replace(
                                        "[", " ")
                                    publish_date1 = publish_date1.replace(
                                        "]", " ")
                                    publish_time1 = [
                                        publish_date.hour, publish_date.minute
                                    ]
                                    publish_time1 = str(publish_time1)
                                    publish_time1 = publish_time1.replace(
                                        ",", ":")
                                    publish_time1 = publish_time1.replace(
                                        "[", " ")
                                    publish_time1 = publish_time1.replace(
                                        "]", " ")
                                except:
                                    publish_date1 = ""
                                    publish_time1 = ""
                                filename = (f'NewsArticle{ct}')
                                filename = filename.replace(":", "+")
                                filename = filename.replace(" ", "")
                                filename = filename + '.csv'

                                with open(
                                        f'C:\\Users\\ankit\\Desktop\\ArticleProject\\Excel_File\\{mon}\\{day}\\{filename}',
                                        'a') as csv_file:
                                    writer = csv.writer(csv_file)
                                    if i == 0:
                                        writer.writerow([
                                            "Headline", "Name of Site",
                                            "Article URL", "Article Text",
                                            "Image URL", "Download date",
                                            "Download Time",
                                            "News Date(DD/MM/YYYY)",
                                            "News Time(HH:MM)"
                                        ])
                                    else:
                                        wri = writer.writerow([
                                            str(titles.encode("utf-8"))[2:-1],
                                            str(news_title.encode(
                                                "utf-8"))[2:-1],
                                            str(url.encode("utf-8"))[2:-1],
                                            str(texts.encode("utf-8"))[2:-1],
                                            str(image_url.encode("utf-8"))
                                            [2:-1], down_Date, dow_time,
                                            publish_date1, publish_time1
                                        ])
                                    i = i + 1
            else:
                continue

        else:
            continue
Example #21
    def parse_article(self, response):
        news_id = 19684  #response.meta.get('news_id')

        # save to file
        with open(str(news_id) + '.html', 'wb') as fh:
            fh.write(response.body)
        article = Article(response.url)
        # set html manually
        with open(str(news_id) + '.html', 'rb') as fh:
            article.html = fh.read()
        os.remove(str(news_id) + '.html')
        # need to set download_state to 2 for this to work
        article.download_state = 2
        article.parse()
        article.nlp()
        date = article.publish_date
        keywords = str([x.replace("'", "''")
                        for x in article.keywords]).replace('"', '\'')
        content = article.text.replace("'", "''")
        summary = article.summary.replace("'", "''")
        title = article.title.replace("'", "''")
        if date is None:
            date = 'null'
        else:
            date = "'" + str(date) + "'"
        authors = str([x.replace("'", "''")
                       for x in article.authors]).replace('"', '\'')
        tags = str([x.replace("'", "''")
                    for x in article.meta_keywords]).replace('"', '\'')

        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-newspaper"("IDNews", "Date", "Content", "Keywords", '
            + '"Summary", "Authors", "Tags", "Title") ' + 'VALUES (' +
            str(news_id) + ', ' + str(date) + ', \'' + content + '\', ARRAY ' +
            str(keywords) + '::text[], \'' + summary + '\', ARRAY ' +
            str(authors) + '::text[], ARRAY ' + str(tags) + '::text[], \'' +
            title + '\')')

        # get main article without comments
        content = extract_content(response.text).replace("'", "''")

        # get article and comments
        content_comments = '[\'' + extract_content_and_comments(
            response.text).replace("'", "''") + '\']'

        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-dragnet"("IDNews", "Content", "Comments") '
            + 'VALUES (' + str(news_id) + ', \'' + content + '\', ARRAY ' +
            str(content_comments) + '::text[])')

        date = articleDateExtractor.extractArticlePublishedDate(
            articleLink=response.url, html=response.text)
        if date is not None:
            dbconnector.execute(
                self.conn, 'INSERT INTO "ParsedNews-ade"("IDNews", "Date") ' +
                'VALUES (' + str(news_id) + ', \'' + str(date) + '\')')

        g = Goose()
        article = g.extract(raw_html=response.text)
        date = article.publish_datetime_utc
        keywords = str([x.replace("'", "''")
                        for x in article.tags]).replace('"', '\'')
        content = article.cleaned_text.replace("'", "''")
        summary = article.meta_description.replace("'", "''")
        title = article.title.replace("'", "''")
        if date is None:
            date = 'null'
        else:
            date = "'" + str(date) + "'"
        authors = str([x.replace("'", "''")
                       for x in article.authors]).replace('"', '\'')
        tags = str([
            x.replace("'", "''") for x in article.meta_keywords.split(",")
        ]).replace('"', '\'')
        tweets = str([x.replace("'", "''")
                      for x in article.tweets]).replace('"', '\'')

        dbconnector.execute(
            self.conn, 'INSERT INTO "ParsedNews-goose"(' +
            '"IDNews", "Date", "Content", "Keywords", "Summary", ' +
            '"Authors", "Tags", "Tweets",' + '"Title") VALUES (' +
            str(news_id) + ', ' + date + ', \'' + content + '\', ARRAY ' +
            str(keywords) + '::text[], \'' + str(summary) + '\', ARRAY ' +
            str(authors) + '::text[], ARRAY ' + str(tags) +
            '::text[], ARRAY ' + str(tweets) + '::text[], \'' + str(title) +
            '\')')

Example #22
import articleDateExtractor
website = "https://edition.cnn.com/2019/07/21/europe/bulgaria-hack-tax-intl/index.html"
d = articleDateExtractor.extractArticlePublishedDate(website)

print(d)
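Example #22 lets the library fetch the page itself. When the HTML is already downloaded (as in Examples #8, #18 and #21), it can be passed as the second argument so the extractor does not request the URL again:

import requests

import articleDateExtractor

website = "https://edition.cnn.com/2019/07/21/europe/bulgaria-hack-tax-intl/index.html"
html = requests.get(website).text
d = articleDateExtractor.extractArticlePublishedDate(website, html=html)

print(d)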
Example #23
cursor = mariadb_connection.cursor()

#retrieving information

cursor.execute("SELECT url FROM skcript")
data = cursor.fetchall()
for text in data:
    try:
        url = text[0]
        article = Article(url)
        article.download()
        article.parse()
        try:
            cursor.execute(
                "UPDATE skcript set author={!a},charCount='{:d}',title={!a} where url='{!s}'"
                .format("".join(article.authors), len(article.text),
                        article.title, url))
        except mariadb.Error as error:
            print("Error: {}".format(error))
        d = articleDateExtractor.extractArticlePublishedDate(url)
        try:
            # update only the row for this url
            cursor.execute(
                "UPDATE skcript set date='{:%Y-%m-%d}' where url='{!s}'".format(
                    d, url))
        except (TypeError, mariadb.Error):
            print("date error")
    except ArticleException:
        continue

mariadb_connection.commit()

mariadb_connection.close()