def checkAndCleanText(self, inputText, rawData):
    """ Check and clean article text """
    cleanedText = inputText
    invalidFlag = False
    try:
        for badString in self.invalidTextStrings:
            if cleanedText.find(badString) >= 0:
                logger.debug("%s: Found invalid text strings in data extracted: %s",
                             self.pluginName, badString)
                invalidFlag = True
        # if the article content is not valid or is too short, re-extract it from the raw data:
        if invalidFlag is True or len(cleanedText) < self.minArticleLengthInChars:
            cleanedText = self.extractArticleBody(rawData)
        # replace repeated spaces, tabs, hyphens, '\n', '\r\n', etc.
        cleanedText = filterRepeatedchars(
            cleanedText,
            deDupeList([' ', '\t', '\n', '\r\n', '-', '_', '.']))
        cleanedText = cleanedText.replace('\n', ' ')
        # remove invalid substrings:
        for stringToFilter in deDupeList(self.subStringsToFilter):
            cleanedText = cleanedText.replace(stringToFilter, " ")
    except Exception as e:
        logger.error("Error cleaning text: %s", e)
    return cleanedText
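# checkAndCleanText() relies on two helpers, filterRepeatedchars() and deDupeList(),
# whose definitions are not shown in this section. Below is a minimal sketch of
# filterRepeatedchars(), assuming it only collapses consecutive runs of each token
# down to a single occurrence; the project's actual implementation may differ.
import re

def filterRepeatedchars(inputText: str, tokensToCollapse: list) -> str:
    # Hypothetical helper: collapse consecutive runs of each token (e.g. '   '
    # or '\r\n\r\n') into a single occurrence. Assumed behaviour only.
    for token in tokensToCollapse:
        inputText = re.sub('(?:' + re.escape(token) + '){2,}', token, inputText)
    return inputText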
def test_deDupeList():
    # Test to deduplicate a list
    (parentFolder, sourceFolder, testdataFolder) = getAppFolders()
    sys.path.append(sourceFolder)
    import scraper_utils
    listWithDuplicates = ['one', 'two', 'two', 'three']
    resultList = scraper_utils.deDupeList(listWithDuplicates)
    print('Resulting list after de-duplicating:', resultList)
    assert len(resultList) == 3 and 'two' in resultList, \
        "8. deDupeList() is not de-duplicating lists correctly."
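# For reference, an order-preserving implementation that satisfies this test could
# be as simple as the sketch below. Note the test only checks length and membership,
# so order preservation is an assumption, not something scraper_utils.deDupeList()
# is confirmed to guarantee.
def deDupeList(inputList: list) -> list:
    # Remove duplicates while preserving first-seen order; dict keys retain
    # insertion order in Python 3.7+.
    return list(dict.fromkeys(inputList))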
def checkAndCleanText(self, inputText, rawData):
    """ Check and clean article text """
    cleanedText = inputText
    try:
        # ignore the newspaper-extracted text; the alternate method's text is more accurate:
        cleanedText = self.extractArticleBody(rawData)
        for badString in self.invalidTextStrings:
            if cleanedText.find(badString) >= 0:
                logger.debug("%s: Found invalid text strings in data extracted: %s",
                             self.pluginName, badString)
                return None
        # replace repeated spaces, tabs, hyphens, '\n', '\r\n', etc.
        cleanedText = filterRepeatedchars(
            cleanedText,
            deDupeList([' ', '\t', '\n', '\r\n', '-', '_', '.']))
        # remove invalid substrings:
        for stringToFilter in deDupeList(self.subStringsToFilter):
            cleanedText = cleanedText.replace(stringToFilter, " ")
    except Exception as e:
        logger.error("Error cleaning text: %s", e)
    return cleanedText
def setKeyWords(self, articleKeyWordsList):
    """ Set the keywords in the article """
    resultList = []
    try:
        for keyword in articleKeyWordsList:
            # clean words, trim whitespace:
            resultList.append(NewsEvent.cleanText(keyword))
        # de-duplicate the list
        resultList = deDupeList(resultList)
    except Exception as e:
        logger.error("Error cleaning keywords for article: %s", e)
    self.urlData["keywords"] = resultList
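# NewsEvent.cleanText() is not shown in this section; per the comment above it
# cleans words and trims whitespace. A hypothetical stand-in under exactly that
# assumption (the real method probably performs additional normalisation):
class NewsEvent:
    @staticmethod
    def cleanText(keyword: str) -> str:
        # Hypothetical stand-in: trim surrounding whitespace and collapse
        # internal runs of whitespace.
        if keyword is None:
            return ''
        return ' '.join(keyword.split())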
def addURLsToPendingTable(self, urlList: list, pluginName: str):
    """ Add newly identified URLs to the pending table.
    Check duplicates using SQL:
    select count(*), url from pending_urls group by url having count(*) > 1

    :param urlList: List of URLs to be added
    :param pluginName: Name of the plugin for which the URLs are to be added
    :return:
    """
    sqlCon = None
    try:
        logger.debug("Add URL list to pending table in db: Waiting to get db exclusive access...")
        acqResult = self.dbAccessSemaphore.acquire(timeout=30)
        if acqResult is True:
            logger.debug("Adding URL list to pending table for plugin %s: Got exclusive db access.",
                         pluginName)
            sqlCon = SessionHistory.openConnFromfile(self.dbFileName)
            cur = sqlCon.cursor()
            urlList = deDupeList(urlList)
            # TODO: parallelize this to process the entire list in one iteration
            for sURL in urlList:
                # Ideally, if the URL already exists, its attempt count should be
                # incremented by 1, but that is omitted here:
                cur.execute('insert or ignore into pending_urls (url, plugin_name, attempts) values (?, ?, 1)',
                            (sURL, pluginName))
            sqlCon.commit()
    except Exception as e:
        logger.error(f"Error while adding URL list to pending table: {e}")
    finally:
        if sqlCon:
            sqlCon.close()
        self.dbAccessSemaphore.release()
        logger.debug(f"Completed adding URL list to pending table for plugin {pluginName}:"
                     " Released exclusive db access.")
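# The duplicate check mentioned in the docstring can be run as a standalone query.
# A minimal sketch using the sqlite3 standard library; 'history.db' is a placeholder
# path, not the project's configured dbFileName. Because the insert above uses
# 'insert or ignore', this query should return no rows as long as pending_urls
# enforces a unique constraint on url.
import sqlite3

conn = sqlite3.connect('history.db')  # placeholder database path
try:
    duplicates = conn.execute(
        'select count(*), url from pending_urls group by url having count(*) > 1'
    ).fetchall()
    for count, url in duplicates:
        print(f'{url} appears {count} times in pending_urls')
finally:
    conn.close()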
def retrieveTodoURLList(self, pluginName: str) -> list:
    """ Retrieve the URL list from the pending_urls table for the given plugin name.

    :param pluginName: The name of the plugin for which pending URLs need to be listed
    :return: List of pending URLs
    """
    URLsFromSQLite = []
    sqlCon = None
    try:
        logger.debug("Fetching pending url list: Waiting for db exclusive access...")
        acqResult = self.dbAccessSemaphore.acquire()
        if acqResult is True:
            # use a parameterized query instead of string concatenation to avoid
            # SQL injection via the plugin name:
            sqlQuery = ("select distinct url from pending_urls where plugin_name = ?"
                        " and url not in (select url from failed_urls)"
                        " and url not in (select url from url_list)")
            sqlCon = SessionHistory.openConnFromfile(self.dbFileName)
            cur = sqlCon.cursor()
            # execute the query and collect the results into a list:
            cur.execute(sqlQuery, (pluginName,))
            allResults = cur.fetchall()
            for urlTuple in allResults:
                URLsFromSQLite.append(urlTuple[0])
    except Exception as e:
        logger.error("%s: Error when fetching pending url list from sqlite db: %s",
                     pluginName, e)
    finally:
        if sqlCon:
            sqlCon.close()
        self.dbAccessSemaphore.release()
        logger.debug("Fetched pending url list for plugin %s: Released exclusive db access.",
                     pluginName)
    URLsFromSQLite = deDupeList(URLsFromSQLite)
    logger.info(f'{pluginName}: Identified {len(URLsFromSQLite)} URLs from pending table of history database.')
    return URLsFromSQLite
def getURLsListForDate(self, runDate: datetime, sessionHistoryDB: SessionHistory) -> list:
    """ Extract the article URL list from the main URL. Since this is only a news
    aggregator, this method sets the plugin state to Types.STATE_STOPPED at the end.

    :param runDate: Given query date for which news article URLs are to be retrieved
    :type runDate: datetime.datetime
    :param sessionHistoryDB: Not used in this function
    :return: List of URLs identified from this news source
    :rtype: list
    """
    urlList = []
    try:
        searchResultsURLForDate, dataDirForDate = self.prepare_url_datadir_for_date(runDate)
        if searchResultsURLForDate is not None:
            logger.debug('Downloading file from URL: %s', searchResultsURLForDate)
            csv_zip = self.downloadDataArchive(searchResultsURLForDate, self.pluginName)
            csv_files = mod_in_gdelt.extract_csvlist_from_archive(csv_zip, dataDirForDate)
            for csv_filename in csv_files:
                logger.debug("Expanded the fetched zip archive to: %s", csv_filename)
                url_items = mod_in_gdelt.extract_urls_from_csv(csv_filename, country_code='IN')
                urlList = urlList + url_items
            urlList = deDupeList(urlList)
            logger.info("Added %s URLs from aggregated news from %s",
                        len(urlList), searchResultsURLForDate)
    except Exception as e:
        logger.error("%s: When extracting the URL list from the main URL, the error was: %s",
                     self.pluginName, e)
    self.pluginState = Types.STATE_STOPPED
    return urlList
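# prepare_url_datadir_for_date() is not shown in this section. GDELT publishes
# date-stamped export archives, so a plausible but entirely hypothetical sketch of
# deriving the archive URL and a matching local data directory from a run date is
# given below; the base URL, directory layout, and naming are assumptions, and the
# plugin's real method may differ substantially.
import os
from datetime import datetime

GDELT_EVENTS_BASE = 'http://data.gdeltproject.org/events'  # assumed base URL

def prepare_url_datadir_for_date(runDate: datetime, baseDataDir: str = 'data'):
    # Hypothetical sketch: build a daily export archive URL
    # (e.g. .../20210115.export.CSV.zip) and create a per-date data directory.
    dateStamp = runDate.strftime('%Y%m%d')
    archiveURL = f'{GDELT_EVENTS_BASE}/{dateStamp}.export.CSV.zip'
    dataDirForDate = os.path.join(baseDataDir, dateStamp)
    os.makedirs(dataDirForDate, exist_ok=True)
    return archiveURL, dataDirForDate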
def extract_urls_from_csv(csv_filename: str, country_code='IN') -> list:
    """ Extract the URL list from a CSV file.

    :param csv_filename: File to read from
    :param country_code: ISO 2-character country code to filter news
    :return: List of relevant URLs extracted from the CSV file
    """
    urlList = []
    # load the csv file with pandas; columns 14 and 24 have mixed types,
    # so set low_memory=False to avoid dtype-guessing warnings:
    urlDF = pd.read_csv(csv_filename, delimiter='\t', header=None, low_memory=False)
    # filter and identify URLs for India:
    # column 51 is the country code, column 57 is the URL
    for item in urlDF[urlDF.iloc[:, 51] == country_code].iloc[:, 57].values:
        urlList.append(item.strip())
    # delete the csv file once processed:
    os.remove(csv_filename)
    return deDupeList(urlList)
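# Example usage, noting the side effect that the input file is deleted after
# processing; 'gdelt_export.csv' is a placeholder filename, since in practice the
# path comes from extract_csvlist_from_archive() in the calling code above.
indian_urls = extract_urls_from_csv('gdelt_export.csv', country_code='IN')
print(f'Extracted {len(indian_urls)} unique URLs for country IN')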