Example #1
import re
import time


class Classifier:
    def __init__(self):
        # Keywords that mark a title as relevant (a PM-visit news item).
        self.bag_of_words = [
            'modi', 'pm', 'visit', 'narendra', 'prime minister'
        ]
        # DBConnection is the project's own helper (not shown in this example).
        self.db_con = DBConnection().create_connection()
        self.cursor = self.db_con.cursor()

    def classify_title(self):
        # Walk the news_url table one row at a time, starting after the last id
        # recorded in scraper_info, and flag titles that match any keyword.
        sql = "SELECT id, title FROM news_url WHERE id > %s ORDER BY id ASC"
        sql_update = "UPDATE news_url SET is_valid = True WHERE id = %s"

        news_url_id = self.get_last_news_url_id()
        counter = 0
        while True:
            self.cursor.execute(sql, (news_url_id, ))
            rs_tuple = self.cursor.fetchone()
            if not rs_tuple:
                break

            (news_url_id, title) = rs_tuple
            print(news_url_id)

            # Mark the row as valid on the first keyword hit; one update is enough.
            for key_word in self.bag_of_words:
                if re.search(key_word, title.strip(), re.I):
                    self.cursor.execute(sql_update, (news_url_id, ))
                    self.db_con.commit()
                    break
            counter += 1
            if counter % 1000 == 0:
                # Throttle: pause briefly after every 1000 rows.
                print('Sleeping for 5 seconds')
                time.sleep(5)

            # Remember the last processed id so an interrupted run can resume.
            self.update_last_news_url_id(news_url_id)

    def update_last_news_url_id(self, news_url_id):
        sql = "UPDATE scraper_info SET news_url_id = %s WHERE id = 1"
        self.cursor.execute(sql, (news_url_id, ))
        self.db_con.commit()

    def get_last_news_url_id(self):
        sql = "SELECT news_url_id FROM scraper_info"
        self.cursor.execute(sql)
        rs_tuple = self.cursor.fetchone()
        (news_url_id, ) = rs_tuple
        return news_url_id
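
Both Classifier examples (and the Xtractor in Example #3) depend on a project-specific DBConnection helper that is not shown on this page. The %s placeholders and the ON CONFLICT clause used later point at PostgreSQL, so a minimal sketch of that helper, assuming psycopg2 and made-up connection settings, could look like this:

import psycopg2


class DBConnection:
    # Sketch only: the real project supplies its own implementation; the
    # psycopg2 driver and the credentials below are assumptions.
    def create_connection(self):
        return psycopg2.connect(
            host="localhost",
            dbname="toi_scraper",   # hypothetical database name
            user="scraper",         # hypothetical credentials
            password="secret",
        )

With a helper like that in place, the classifier would run as Classifier().classify_title().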
Example #2
import time

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.stem import PorterStemmer
from nltk.tree import Tree


class Classifier:
    def __init__(self):
        # Keywords that mark a title as relevant. Note that a multi-word entry
        # such as 'prime minister' can never match the per-token set built in
        # classify_title; only single tokens are compared.
        self.bag_of_words = [
            'modi', 'pm', 'visit', 'narendra', 'prime minister'
        ]
        self.bag_of_words_set = set(self.bag_of_words)
        # DBConnection is the project's own helper (not shown in this example).
        self.db_con = DBConnection().create_connection()
        self.cursor = self.db_con.cursor()

    def classify_title(self):
        # Same row-by-row walk as Example #1, but each title is tokenised and
        # stemmed before being intersected with the keyword set.
        sql = "SELECT id, title FROM news_url WHERE id > %s ORDER BY id ASC"
        sql_update = "UPDATE news_url SET is_valid = True WHERE id = %s"

        news_url_id = self.get_last_news_url_id()
        counter = 0
        stemmer = PorterStemmer()  # one stemmer instance serves the whole run

        while True:
            self.cursor.execute(sql, (news_url_id, ))
            rs_tuple = self.cursor.fetchone()
            if not rs_tuple:
                break

            (news_url_id, title) = rs_tuple
            print(news_url_id)
            # Stem every token of the lower-cased title, then intersect with the keyword set.
            title = title.lower()
            title_set = {stemmer.stem(item) for item in title.split()}
            result_set = title_set & self.bag_of_words_set
            print("result_set", result_set)
            '''if result_set:
                self.cursor.execute(sql_update, (news_url_id, ))
                self.db_con.commit()'''

            #for item in title_set:

            #    if item.lower() in COUNTRY:
            #       # print(item)
            #        if result_set:
            #            self.cursor.execute(sql_update, (news_url_id, ))
            #            self.db_con.commit()
            #    else:
            #        if self.get_place(item) and result_set:
            #           self.cursor.execute(sql_update, (news_url_id, ))
            #            self.db_con.commit()

            counter += 1
            if counter % 1000 == 0:
                print('Sleeping for 5 seconds')
                time.sleep(5)

            #self.update_last_news_url_id(news_url_id)

    def update_last_news_url_id(self, news_url_id):
        sql = "UPDATE scraper_info SET news_url_id = %s WHERE id = 1"
        self.cursor.execute(sql, (news_url_id, ))
        self.db_con.commit()

    def get_last_news_url_id(self):
        sql = "SELECT news_url_id FROM scraper_info"
        self.cursor.execute(sql)
        rs_tuple = self.cursor.fetchone()
        (news_url_id, ) = rs_tuple
        return news_url_id

    def get_place(self, text):
        # True if NLTK's named-entity chunker tags anything in the text as a
        # geo-political entity (GPE), i.e. a country, city or state name.
        chunked = ne_chunk(pos_tag(word_tokenize(text)))

        for subtree in chunked:
            if isinstance(subtree, Tree) and subtree.label() == "GPE":
                return True

        return False
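
Example #2 relies on NLTK's tokenizer, POS tagger and named-entity chunker, which need their data packages installed once before word_tokenize, pos_tag and ne_chunk will work. A one-time setup, assuming a standard NLTK installation, is roughly:

import nltk

# Data packages used by word_tokenize, pos_tag and ne_chunk.
for resource in ("punkt", "averaged_perceptron_tagger", "maxent_ne_chunker", "words"):
    nltk.download(resource)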
Example #3
import calendar
import sys
import time

import requests
from bs4 import BeautifulSoup


class Xtractor:
    def __init__(self):
        self.year = 2015
        self.base_url = "http://timesofindia.indiatimes.com/%s"
        self.url_to_scrape = None
        # DBConnection and NoMoreUrlFoundException are the project's own
        # helpers (not shown in this example).
        self.db_con = DBConnection().create_connection()
        self.cursor = self.db_con.cursor()
        self.request_headers = {
            "User-Agent":
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        }

    def save_last_month_date_url(self, month_date_url_id):
        sql_scraper_info = "UPDATE scraper_info SET last_month_date_url_id = %s WHERE id=1"
        self.cursor.execute(sql_scraper_info, (month_date_url_id, ))
        self.db_con.commit()

    def get_url_to_scrape(self):
        # Return the first month_date_url row after the last one recorded in
        # scraper_info, or the very first row on a fresh run.
        month_date_url_id = 0
        sql_last_month_date_url = "SELECT last_month_date_url_id FROM scraper_info WHERE id=1"
        sql_get_month_date_url = """SELECT month_date_url, id
                                    FROM month_date_url
                                    %s"""
        self.cursor = self.db_con.cursor()
        self.cursor.execute(sql_last_month_date_url)
        rs_tuple = self.cursor.fetchone()

        if rs_tuple:
            (month_date_url_id, ) = rs_tuple
            condition = 'WHERE id > %s ORDER BY id asc' % (month_date_url_id)
            self.cursor.execute(sql_get_month_date_url % condition)
            rs_tuple = self.cursor.fetchone()

            if not rs_tuple:
                raise NoMoreUrlFoundException(
                    "No URL left to scrape...you are done scraping all the urls present in DB"
                )
            (month_date_url, month_date_url_id) = rs_tuple
        else:
            self.cursor.execute(sql_get_month_date_url % ('ORDER BY id asc'))
            rs_tuple = self.cursor.fetchone()
            (month_date_url, month_date_url_id) = rs_tuple

        return (month_date_url_id, month_date_url)

    def start(self):
        # Main loop: fetch one archive page every 5 seconds and parse its
        # article titles, until every month_date_url row has been processed.
        while True:
            try:
                time.sleep(5)
                (month_date_url_id, month_date_url) = self.get_url_to_scrape()
                print(month_date_url)
                url_to_open = self.base_url % month_date_url
                print(url_to_open)
                html = requests.get(url_to_open,
                                    headers=self.request_headers).text
                self.parse_titles(html, month_date_url_id)
            except NoMoreUrlFoundException:
                print("------------------DONE------------------------")
                sys.exit(0)
        return

    def save_tile_and_href(self, month_date_url_id, title, href):
        sql = "INSERT INTO news_url (month_date_url_id, title, url) VALUES (%s, %s, %s) ON CONFLICT (title) DO NOTHING"
        self.cursor.execute(sql, (month_date_url_id, title, href))
        self.db_con.commit()
        return

    def parse_titles(self, html, month_date_url_id):
        soup = BeautifulSoup(html, 'lxml')
        div = soup.find(
            'div', {
                'style':
                'font-family:arial ;font-size:12;font-weight:bold; color: #006699'
            })
        table = div.find('table')
        atag_list = table.find_all('a')

        # Debug dump of the table that was just parsed.
        with open("/home/aish/Desktop/toi/toi.html", 'w') as f:
            f.write(str(table))

        for a in atag_list:
            title = a.text.strip()
            href = a['href']
            self.save_tile_and_href(month_date_url_id, title, href)

        self.save_last_month_date_url(month_date_url_id)
        return

    def initiate(self):
        # Pre-populate month_date_url with one archive URL per day of the year.
        sql_insert_month_date_url = "INSERT INTO month_date_url (url_month, url_date, year, month_date_url) VALUES (%s, %s, %s, %s)"
        toi_starttime = 42005  # archive 'starttime' day number for 1 January 2015
        month_counter = 1
        date_counter = 1

        # Add all url of all month date to database
        while True:
            # Get number of days in a month depending on the year
            num_of_days_in_month = calendar.monthrange(self.year,
                                                       month_counter)[1]

            while True:
                toi_url_month_date = "/%s/1/1/archivelist/year-%s,month-%s,starttime-%s.cms" % (
                    self.year, self.year, month_counter, toi_starttime)
                self.cursor.execute(sql_insert_month_date_url,
                                    (month_counter, date_counter, self.year,
                                     toi_url_month_date))
                self.db_con.commit()

                toi_starttime += 1
                date_counter += 1
                print(toi_url_month_date)

                if date_counter > num_of_days_in_month:
                    # Reset the date counter to 1 before break for each month
                    date_counter = 1
                    break

            month_counter += 1

            # Break when December is reached
            if month_counter > 12:
                break
    '''
    #Stage 4
    # keyword specific seach
    if re.search("arrives|attend|embarks|reaches", content,re.DOTALL):
        filter_count = filter_count + 1
        cursor.execute(sql, (filter_count, id))
        print (">>>>>>>>>>>>>>>>>",id)
    '''
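
Example #3 does not show its entry point. A plausible way to drive the class (an assumption, not part of the original code) is to populate the month/date URLs once with initiate() and then let start() work through them; the module-level fragment that follows is a later stage of the same pipeline.

if __name__ == "__main__":
    # Hypothetical driver; the original script's __main__ section is not shown.
    xtractor = Xtractor()
    xtractor.initiate()  # insert one archive URL per day of 2015 into month_date_url
    xtractor.start()     # scrape each archive page and store its article titles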
# Module-level fragment: extract (place, visit_date) pairs from articles that
# were classified as PM-visit news. `cursor`, `dbcon`, `regex` (presumably a
# pattern matching country names) and get_continuous_chunks() are assumed to be
# defined earlier in the original script.
import re

import requests
from bs4 import BeautifulSoup
sql_news_title = "SELECT n.title, n.id, n.url FROM news_url n INNER JOIN classified_article c ON c.news_url_id = n.id WHERE c.counter > 3 "
sql_insert = "INSERT INTO visit_info(place, visit_date) values(%s, %s) ON CONFLICT (place, visit_date) DO NOTHING"
cursor.execute(sql_news_title)
rs_tuple_list = cursor.fetchall()
for rs_tuple in rs_tuple_list:
    (title, id, url) = rs_tuple
    # Pull named-entity chunks out of the title and keep those that look like
    # country names.
    lst_ner = get_continuous_chunks(title)
    country_list = []
    for item in lst_ner:
        if re.search(regex, item.strip().lower()):
            country_list.append(item)
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'lxml')
    if country_list:
        # The publication date sits in the "time_cptn" span; the regex keeps the
        # part between the '|' separator and the trailing text (a 2015 date).
        date = soup.find('span', {'class': "time_cptn"}).text
        date_reg = re.search(r'(.*\|)(.*2015)(.*)', date)
        date_str = date_reg.group(2)
        for country in country_list:
            cursor.execute(sql_insert, (country, date_str))
            dbcon.commit()
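
The fragment above calls get_continuous_chunks(), which is not defined on this page. A common NLTK-based sketch of such a helper (an assumption about what the original version does) groups consecutive named-entity tokens into phrases:

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree


def get_continuous_chunks(text):
    # Group consecutive named-entity tokens into phrases such as "United States".
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    for subtree in chunked:
        if isinstance(subtree, Tree):
            # Tokens inside a named entity: collect them.
            current_chunk.append(" ".join(token for token, pos in subtree.leaves()))
        elif current_chunk:
            # Entity ended: flush the collected tokens as one phrase.
            continuous_chunk.append(" ".join(current_chunk))
            current_chunk = []

    if current_chunk:
        continuous_chunk.append(" ".join(current_chunk))

    return continuous_chunk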