def setDateRangeTo(self, daterangeTo=None, printing=False):
    """Set the upper bound of the date range.

    The given value is passed through self._convertToValidJulianDate; on
    success the converted value is stored in self.daterangeTo and mirrored
    into self.config under both 'daterangeTo' and 'daterange_to'.

    Args:
        daterangeTo: the new upper bound, in any form accepted by
            self._convertToValidJulianDate.
        printing: if we should print to terminal or not.

    Returns:
        True if the value was stored. False if daterangeTo is None, if the
        conversion returned -1, or if the converted value is smaller than
        an already-set self.daterangeFrom.
    """
    if daterangeTo is None:
        return False

    julianTo = self._convertToValidJulianDate(daterangeTo, printing)
    if julianTo == -1:  ## conversion failed
        return False

    ## Only enforce ordering when a lower bound has already been set.
    if self.daterangeFrom is not None and julianTo < self.daterangeFrom:
        print_error(
            printing, self.__class__.__name__,
            sys._getframe().f_code.co_name,
            "daterangeTo cannot be less than daterangeFrom.")
        return False

    self.daterangeTo = julianTo
    ## update config (both spellings of the key are kept in sync)
    for configKey in ('daterangeTo', 'daterange_to'):
        self.config[configKey] = julianTo
    return True
def setDateRangeFrom(self, daterangeFrom=None, printing=False):
    """Set the lower bound of the date range.

    The given value is passed through self._convertToValidJulianDate; on
    success the converted value is stored in self.daterangeFrom and
    mirrored into self.config under both 'daterangeFrom' and
    'daterange_from'.

    Args:
        daterangeFrom: the new lower bound, in any form accepted by
            self._convertToValidJulianDate.
        printing: if we should print to terminal or not.

    Returns:
        True if the value was stored. False if daterangeFrom is None, if
        the conversion returned -1, or if the converted value is greater
        than an already-set self.daterangeTo.
    """
    if daterangeFrom is None:
        return False

    julianFrom = self._convertToValidJulianDate(daterangeFrom, printing)
    if julianFrom == -1:  ## conversion failed
        return False

    ## Only enforce ordering when an upper bound has already been set.
    if self.daterangeTo is not None and julianFrom > self.daterangeTo:
        print_error(
            printing, self.__class__.__name__,
            sys._getframe().f_code.co_name,
            "daterangeFrom cannot be greater than daterangeTo.")
        return False

    self.daterangeFrom = julianFrom
    ## update config (both spellings of the key are kept in sync)
    for configKey in ('daterangeFrom', 'daterange_from'):
        self.config[configKey] = julianFrom
    return True
def DBSetup(self, dbTableName, printing):
    """
    Ensures a table with the required name exists in the database.

    Args:
        dbTableName: the table name to be used in the database.
        printing: if we should print to terminal or not.

    Returns:
        True if the CREATE TABLE IF NOT EXISTS statement was executed and
        committed on self.conn, False if anything raised along the way.
    """
    ## self.conn is used without a None check: if it is None, the resulting
    ## AttributeError is caught below and reported like any other failure.
    ## NOTE: dbTableName is interpolated directly into the SQL text (table
    ## names cannot be bound as parameters), so it must come from a trusted
    ## source.
    try:
        self.conn.execute('''CREATE TABLE IF NOT EXISTS `%s`(
            resultNumberInSearch    INTEGER,
            SearchEngines           TEXT,
            Topic                   TEXT NOT NULL,
            URL                     TEXT NOT NULL,
            ResultPageNumber        INTEGER NOT NULL,
            ResultNumberOnPage      INTEGER NOT NULL,
            StartDate               INTEGER,
            EndDate                 INTEGER,
            SearchedOnDate          DATE,
            ObtainedFromQuery       TEXT NOT NULL,
            QueryPageURL            TEXT,
            PRIMARY KEY(SearchEngines, Topic, URL)
        );
        ''' % dbTableName)
        self.conn.commit()
        return True
    except Exception as e:  ## "as" form works on Python 2.6+ and Python 3
        print_error(printing, self.__class__.__name__,
                    sys._getframe().f_code.co_name,
                    "could not create table '"+dbTableName+"' in database.",
                    e)
        return False
def goToNextDateRange(self, newRange=None, printing=False):
    """Shift the date range forward so it starts where the current one ends.

    Calls self.setDateRange(daterangeTo, daterangeTo + newRange).

    Args:
        newRange: length of the new range. When None, the length of the
            current range (daterangeTo - daterangeFrom) is reused.
        printing: if we should print to terminal or not.

    Returns:
        True once self.setDateRange has been called (its own result is not
        inspected), False if either bound of the current range is unset.
    """
    if self.daterangeFrom is None or self.daterangeTo is None:
        print_error(printing, self.__class__.__name__,
                    sys._getframe().f_code.co_name,
                    "daterangeFrom and daterangeTo are not set.")
        return False

    ## Default to the span of the current range.
    span = newRange if newRange is not None else (
        self.daterangeTo - self.daterangeFrom)
    self.setDateRange(self.daterangeTo, self.daterangeTo + span)
    return True
def goToPreviousDateRange(self, newRange=None, printing=False):
    """Shift the date range backward so it ends where the current one starts.

    Calls self.setDateRange(daterangeFrom - newRange, daterangeFrom).

    Args:
        newRange: length of the new range. When None, the length of the
            current range (daterangeTo - daterangeFrom) is reused.
        printing: if we should print to terminal or not.

    Returns:
        True once self.setDateRange has been called (its own result is not
        inspected), False if either bound of the current range is unset.
    """
    if self.daterangeFrom is None or self.daterangeTo is None:
        print_error(printing, self.__class__.__name__,
                    sys._getframe().f_code.co_name,
                    "daterangeFrom and daterangeTo are not set.")
        return False

    ## Default to the span of the current range (= length of time period).
    span = newRange if newRange is not None else (
        self.daterangeTo - self.daterangeFrom)
    self.setDateRange(self.daterangeFrom - span, self.daterangeFrom)
    return True
def setInTitle(self, intitle=None, printing=False):
    """Set the title filter for the query.

    Surrounding whitespace is stripped. A title that still contains a
    space is wrapped in double quotes. The result is stored in
    self.intitle and in self.config['intitle'].

    Args:
        intitle: the title text; must not contain a newline after
            stripping.
        printing: if we should print to terminal or not.

    Returns:
        True if the title was stored, False if intitle is None or contains
        a newline.
    """
    if intitle is None:
        return False

    title = intitle.strip()
    if "\n" in title:
        print_error(
            printing, self.__class__.__name__,
            sys._getframe().f_code.co_name,
            "the title cannot have newlines in the word, only spaces, hyphens, underscores and periods."
        )
        return False

    if " " in title:
        ## multiple words: surround them with quotes
        title = '"%s"' % title

    self.intitle = title
    ## update config
    self.config['intitle'] = title
    return True
def setFuzzyTopicsList(self, fuzzyTopicsList=None, printing=False):
    """Set the list of fuzzy topics for the query.

    On success the list is stored in self.fuzzyTopicsList and mirrored
    into self.config under both 'fuzzyTopicsList' and 'fuzzy_topics_list'.

    Args:
        fuzzyTopicsList: a non-empty list whose items are all of type str.
        printing: if we should print to terminal or not.

    Returns:
        True if the list was stored, False if it is None, equal to [], or
        contains an item whose type is not str.
    """
    if fuzzyTopicsList is None:
        return False

    if fuzzyTopicsList == []:
        print_error(printing, self.__class__.__name__,
                    sys._getframe().f_code.co_name,
                    "the list cannot be empty.")
        return False

    ## Exact type match (subclasses of str are rejected).
    itemIsString = [type(item) is str for item in fuzzyTopicsList]
    if not all(itemIsString):
        print_error(printing, self.__class__.__name__,
                    sys._getframe().f_code.co_name,
                    "the list cannot contain non-strings.")
        return False

    self.fuzzyTopicsList = fuzzyTopicsList
    ## update config (both spellings of the key are kept in sync)
    for configKey in ('fuzzyTopicsList', 'fuzzy_topics_list'):
        self.config[configKey] = fuzzyTopicsList
    return True
def setSiteList(self, siteList=None, printing=False):
    """Set the list of websites for the query.

    On success the list is stored in self.siteList and mirrored into
    self.config under both 'siteList' and 'site_list'.

    Args:
        siteList: a non-empty list of str items, none of which contains a
            space.
        printing: if we should print to terminal or not.

    Returns:
        True if the list was stored, False if it is None, equal to [],
        contains an item whose type is not str, or contains an item with a
        space in it.
    """
    if siteList is None:
        return False

    if siteList == []:
        print_error(printing, self.__class__.__name__,
                    sys._getframe().f_code.co_name,
                    "the list cannot be empty.")
        return False

    ## Exact type match (subclasses of str are rejected).
    itemIsString = [type(site) is str for site in siteList]
    if not all(itemIsString):
        print_error(printing, self.__class__.__name__,
                    sys._getframe().f_code.co_name,
                    "the site list cannot contain non-strings.")
        return False

    itemHasSpace = [" " in site for site in siteList]
    if any(itemHasSpace):
        print_error(
            printing, self.__class__.__name__,
            sys._getframe().f_code.co_name,
            "websites in the site list cannot contain a space in their url."
        )
        return False

    self.siteList = siteList
    ## update config (both spellings of the key are kept in sync)
    for configKey in ('siteList', 'site_list'):
        self.config[configKey] = siteList
    return True
def _convertToValidJulianDate(self, daterangeDate, printing=False): """This function converts daterange values to the appropriate Julian date integer. The dateranges are allowed to be entered as datetime.datetime objects, datetime.date objects, or integers which are assumed to be the julian date (cannot be smaller than start of UNIX time i.e. 1 Jan 1970).""" if daterangeDate is not None: if type(daterangeDate) == type(datetime.datetime.now().date( )) or type(daterangeDate) == type(datetime.datetime.now( )): ## works on both datetime.datetime and datetime.date objects. return self._toJulianDateDatetime(daterangeDate) elif type(daterangeDate) == type( 0): ## if it is an integer, assumed to be julian date. if daterangeDate >= 2440588: ## start of UNIX time, i.e. 1 Jan 1970. Not 4 Sept 1998 (i.e. date of founding of Google as a company) because Google has pages from before it was created. return daterangeDate else: print_error( printing, self.__class__.__name__, sys._getframe().f_code.co_name, "daterangeDate has invalid value of %s, must not be before start of UNIX time i.e. 1 Jan 1970." % (daterangeDate)) else: print_error( printing, self.__class__.__name__, sys._getframe().f_code.co_name, "daterangeDate has invalid value of '%s'. Should be a Julian date integer or a datetime object." % (daterangeDate)) else: print_error(printing, self.__class__.__name__, sys._getframe().f_code.co_name, "daterangeDate not set.") return -1
def connectToSQLiteDB(self, dbFilePath="GoogleSearchResults.db", dbTableName="SearchResultURLs", printing=True): """ Args: dbFilePath: the file path of the SQLite database file. If not a .db file, it is corrected. e.g. "xxx/xxx/xxx.db" stays the same, whereas "xxx/xxx/xxx" becomes the former, and "xxx/xxx/" (i.e. a directory) bedomes "xxx/xxx/GoogleSearchResults.db" dbTableName: the SQLite table name to be referred to henceforth. printing: if we should print to terminal or not. Returns: True or False, depending on whether we have successfully connected to SQLite and created a usable table, or not. """ ## Correct common errors: if dbFilePath.endswith("/") or dbFilePath.endswith("\\"): dbFilePath+="GoogleSearchResults.db" if not dbFilePath.endswith(".db"): dbFilePath+=".db" try: self.conn=sqliteDefaults.get_conn(dbFilePath, printing) except Exception, e: print_error(printing, self.__class__.__name__, sys._getframe().f_code.co_name, "could not connect to SQLite database.", e) self.conn = None return False