Example #1
    def updateCacheNewUrls(self, newLi, origin = None):
        count = 0
        notUrl = 0
        external = 0
        tempScraper = SCRAPER(file_objects)##To access breakDownURL function
        for url in newLi:
            if url[-1] == "/":
                url = url[0:(len(url)-1)]
            try:
                tempScraper.breakDownURL(url)
                try:
                    if tempScraper.domain not in self.scraper.term:
                        external += 1
                except AttributeError:
                    pass
                try:
                    self.db.cur.execute("INSERT INTO cache (url, protocol, domain, status) VALUES (%s, %s, %s, %s);",
                                         (url,
                                          tempScraper.protocol,
                                          tempScraper.domain,
                                          'todo',))
                    count+=1
                except psycopg2.IntegrityError:
                    pass
            except ValueError:
                notUrl+=1
        if origin is not None:
            with open("SQL_QUERIES/insertIntoLinkTrack.txt") as fi:
                query = fi.read().strip()
            self.db.cur.execute(query,
                                (origin, len(newLi), count, notUrl, external,))
        print("Saved {0} of {1} total viable links".format(count, len(newLi)-notUrl))
Example #2
    def __init__(self, configFiName, filterToSeed = False):
        self.configFiName = configFiName
        self.filterToSeed = filterToSeed##Limit queries to urls with domains in seed list
        self.readConfig()
        
        #self.log = LOG(self.createFiName(self.config['path_to_log'], 'log'))
        self.log = LOG('temp_log/test.csv')

        self.scraper = SCRAPER(file_objects)

        with open(self.config['path_to_db_pass']) as fi:
            dbPass = fi.read().strip()
    
        self.db = DBCONN(self.config['database'],
                         self.config['db_host'],
                         self.config['db_user'],
                         dbPass)
        self.seedLi = self.readSeedList()
        if self.db.conn is None:
            raise ValueError("The database does not exist: {0}".format(self.config['databae']))
        try:
            self.db.cur.execute("SELECT * FROM cache;")

        except psycopg2.ProgrammingError as err:
            if 'relation "cache" does not exist' in str(err):
                self.createTableCache()
                self.createTableLinkTrack()
                self.updateCacheNewUrls(self.seedLi)
            else:
                raise ValueError("psycopg2 error: {0}".format(str(err)))
Example #3
class SCRAPING_INSTANCE:
    def __init__(self, configFiName, filterToSeed = False):
        self.configFiName = configFiName
        self.filterToSeed = filterToSeed##Limit queries to urls with domains in seed list
        self.readConfig()
        
        #self.log = LOG(self.createFiName(self.config['path_to_log'], 'log'))
        self.log = LOG('temp_log/test.csv')

        self.scraper = SCRAPER(file_objects)

        with open(self.config['path_to_db_pass']) as fi:
            dbPass = fi.read().strip()
    
        self.db = DBCONN(self.config['database'],
                         self.config['db_host'],
                         self.config['db_user'],
                         dbPass)
        self.seedLi = self.readSeedList()
        if self.db.conn is None:
            raise ValueError("The database does not exist: {0}".format(self.config['databae']))
        try:
            self.db.cur.execute("SELECT * FROM cache;")

        except psycopg2.ProgrammingError as err:
            if 'relation "cache" does not exist' in str(err):
                self.createTableCache()
                self.createTableLinkTrack()
                self.updateCacheNewUrls(self.seedLi)
            else:
                raise ValueError("psycopg2 error: {0}".format(str(err)))
        
        
        
    def createFiName(self, path, prefix):
        timing = datetime.datetime.now()
        timing_formatted = timing.strftime("%Y%m%d-%H%M")
        finame = "{0}/{1}_{2}.csv".format(path,
                                         prefix,
                                         timing_formatted)
        return finame

    def checkPath(self, path):
        if path.endswith("/") or path.endswith("\\"):
            path = path[0:(len(path)-1)]
        return path

    def readConfig(self):
        try:
            with open(self.configFiName) as fi:
                self.config = json.load(fi)
        except FileNotFoundError:
            self.config = {}
            self.config['database'] = input("Name of database: ")
            self.config['path_to_db_pass'] = input("Path (w/ filename) to db password: "******"DB User: "******"DB host: ")
            self.config['path_to_file_save'] = self.checkPath(input("Path to save dump files: "))
            self.config['file_save_name'] = '0'
            self.config['seed_list'] = input('Seed list: ')
            self.config['path_to_log'] = self.checkPath(input("Path to log files: "))
            self.writeConfig()

    def writeConfig(self):
        with open(self.configFiName, 'w') as fi:
            fi.write(json.dumps(self.config))
            
    def createTableCache(self):
        with open("SQL_QUERIES/createCache.txt") as fi:
            query = fi.read().strip()
        self.db.cur.execute(query)

    def createTableLinkTrack(self):
        with open("SQL_QUERIES/createLinkTrack.txt") as fi:
            query = fi.read().strip()
        self.db.cur.execute(query)

    def updateCacheNewUrls(self, newLi, origin = None):
        count = 0
        notUrl = 0
        external = 0
        tempScraper = SCRAPER(file_objects)##To access breakDownURL function
        for url in newLi:
            if url[-1] == "/":
                url = url[0:(len(url)-1)]
            try:
                tempScraper.breakDownURL(url)
                try:
                    if tempScraper.domain not in self.scraper.term:
                        external += 1
                except AttributeError:
                    pass
                try:
                    self.db.cur.execute("INSERT INTO cache (url, protocol, domain, status) VALUES (%s, %s, %s, %s);",
                                         (url,
                                          tempScraper.protocol,
                                          tempScraper.domain,
                                          'todo',))
                    count+=1
                except psycopg2.IntegrityError:
                    pass
            except ValueError:
                notUrl+=1
        if origin is not None:
            with open("SQL_QUERIES/insertIntoLinkTrack.txt") as fi:
                query = fi.read().strip()
            self.db.cur.execute(query,
                                (origin, len(newLi), count, notUrl, external,))
        print("Saved {0} of {1} total viable links".format(count, len(newLi)-notUrl))

    def updateCacheDownload(self, ident):
        with open('SQL_QUERIES/updateCache_download.txt') as fi:
            query = fi.read().strip()
        content_type = tryAndNone(self.scraper.info, 'Content-Type')
        if content_type is None:##Noticed an error with capitalization in some server responses
            content_type = tryAndNone(self.scraper.info, 'Content-type')
        try:
            self.db.cur.execute(
                query,
                (datetime.datetime.now(),##Datetime of download
                 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',##Should be dynamic
                 self.scraper.term,##url_returned
                 self.scraper.protocol,##protocol_returned
                 self.scraper.domain,##domain returned
                 tryAndNone(self.scraper.info, 'Server'),##server returned
                 tryAndNone(self.scraper.info, 'Date'), ##Date on server...in scraper.info dict
                 content_type, ##contenttype
                 tryAndNone(self.scraper.info, 'Connection'), ##connection
                 self.scraper.info_str,##Full response
                 str(self.scraper.soup),##Full string as text
                 str(self.scraper.soup),##Full string to md5 hash
                 str(self.scraper.body_clean),##body_clean
                 ident,))
        except psycopg2.IntegrityError:
            self.db.cur.execute(
                "UPDATE cache SET status = 'ignore', status_msg = 'DUPLICATE' WHERE id = %s;",
                (ident,)
                )

    def updateCacheDownload_Fail(self, record_id, reason = "Unknown"):
        if 'invalid record' in reason:
            self.db.cur.execute("UPDATE cache SET status = 'failed', status_msg = %s WHERE id = %s;",
                                (reason, record_id,))
        elif 'This item is a file object' in reason:
            self.db.cur.execute("UPDATE cache SET status = 'ignore', status_msg = %s WHERE id = %s;",
                                (reason, record_id))
        elif 'Ignored website' == reason:
            self.db.cur.execute("UPDATE cache SET status = 'ignore', status_msg = %s WHERE id = %s;",
                                (reason, record_id,))
        elif 'Failed to goOnline' == reason:
            self.db.cur.execute("UPDATE cache SET status = 'failed', status_msg = %s WHERE id = %s;",
                                (reason, record_id,))
        elif 'Error making URL request' == reason:
            self.db.cur.execute("UPDATE cache SET status = 'failed', status_msg = %s WHERE id = %s;",
                                (reason, record_id,))
        elif 'Could not access page' == reason:
            self.db.cur.execute("UPDATE cache SET status = 'failed', status_msg = %s WHERE id = %s;",
                                (self.scraper.ErrorMessage, record_id,))
        elif 'Parse error' == reason:
            with open('SQL_QUERIES/updateCache_download.txt') as fi:
                query = fi.read().strip()
            content_type = tryAndNone(self.scraper.info, 'Content-Type')
            if content_type is None:##Noticed an error with capitalization in some server responses
                content_type = tryAndNone(self.scraper.info, 'Content-type')
            self.db.cur.execute(query,
                                (reason,##status_msg
                                 datetime.datetime.now(),##Datetime of download
                                 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',##Should be dynamic
                                 self.scraper.term,##url_returned
                                 self.scraper.protocol,##protocol_returned
                                 self.scraper.domain,##domain returned
                                 tryAndNone(self.scraper.info, 'Server'),##server returned
                                 tryAndNone(self.scraper.info, 'Date'), ##Date on server...in scraper.info dict
                                 content_type, ##contenttype
                                 tryAndNone(self.scraper.info, 'Connection'), ##connection
                                 self.scraper.info_str,##Full response
                                 record_id,))
        else:
            print(self.scraper.term)
            print(self.scraper.info)
            self.db.cur.execute("UPDATE cache SET status = 'failed', status_msg = 'UNKNOWN REASON' WHERE id = %s;",
                                (record_id,))
            #print(self.scraper.info)
            #raise AttributeError(
            #    "REASON FOR DOWNLOAD FAIL NOT VALID: {0}".format(reason))

    def readSeedList(self):
        seedList = []
        with open(self.config['seed_list']) as fi:
            reader = csv.reader(fi)
            for i, row in enumerate(reader):
                if i == 0:
                    foundUrlIndex = False
                    for k, col in enumerate(row):
                        if col == 'url':
                            foundUrlIndex = True
                            break
                    if not foundUrlIndex:
                        raise ValueError("Your seed list file needs to have a column named 'url'.")
                else:
                    seedList.append(row[k])
        return seedList

    def makeToDoLi(self, low, high):
        self.db.cur.execute("SELECT id, url FROM cache WHERE status='todo' AND (id>%s AND id <%s);",
                            (low, high,))
        self.toDoLi = self.db.cur.fetchall()
        #print(self.toDoLi)

    def checkIgnore(self, url):
        for x in websites_to_ignore.ignore:
            if x in url:
                return False
        return True

    def checkSeedList(self, url):
        for x in self.seedLi:
            if url.startswith(x):
                return True
        return False

    def iterateThroughToDo(self):
        for urlRecord in self.toDoLi:
            try:
                ident = urlRecord[0]
                url = urlRecord[1]
                goOn = False
                if self.filterToSeed:
                    goOn = self.checkSeedList(url)##Only proceed when the url is in the seed list
                else:
                    goOn = True##Seed list is irrelevant
                if goOn:
                    if self.checkIgnore(url):
                        try:
                            result = self.scraper.goOnline(url)
                            if result is not None:
                                try:
                                    self.scraper.breakDownURL(url)
                                    self.scraper.parseRequestObj(result)
                                    self.updateCacheDownload(ident)
                                    self.updateCacheNewUrls(
                                        self.scraper.newToDoLi,
                                        origin = ident
                                        )
                                except Exception:
                                    self.updateCacheDownload_Fail(
                                        ident,
                                        'Parse error'
                                        )
                            elif 'This item is a file object' in self.scraper.ErrorMessage:
                                self.updateCacheDownload_Fail(
                                    ident,
                                    self.scraper.ErrorMessage
                                    )
                            else:
                                self.updateCacheDownload_Fail(
                                    ident,
                                    'Could not access page'
                                    )
                        except Exception:
                            self.updateCacheDownload_Fail(
                                ident,
                                'Error making URL request'
                                )
                    else:
                        self.updateCacheDownload_Fail(
                            ident,
                            'Ignored website'
                            )
            except IndexError:
                self.updateCacheDownload_Fail(
                    ident,
                    'invalid record in database {0}'.format(str(urlRecord)))
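
None of the examples show the class being driven. A minimal, hypothetical driver is sketched below, assuming SCRAPER, DBCONN, LOG, file_objects, websites_to_ignore and tryAndNone are importable from the project; the config filename, the id range, and the explicit commit are all assumptions rather than anything shown above.

## Hypothetical driver for SCRAPING_INSTANCE. The config filename and id range are
## placeholders, and the explicit commit assumes DBCONN does not autocommit.
if __name__ == "__main__":
    instance = SCRAPING_INSTANCE("scraper_config.json", filterToSeed = True)
    instance.makeToDoLi(0, 1000)##Queue cache rows with status 'todo' and 0 < id < 1000
    instance.iterateThroughToDo()##Download, parse, and record each queued url
    instance.db.conn.commit()##Persist the cache/link_track updates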