Code Example #1
 def checkAndCleanText(self, inputText, rawData):
     """ Check and clean article text
     """
     cleanedText = inputText
     invalidFlag = False
     try:
         for badString in self.invalidTextStrings:
             if cleanedText.find(badString) >= 0:
                 logger.debug(
                     "%s: Found invalid text strings in data extracted: %s",
                     self.pluginName, badString)
                 invalidFlag = True
         # check if article content is not valid or is too little
         if invalidFlag is True or len(
                 cleanedText) < self.minArticleLengthInChars:
             cleanedText = self.extractArticleBody(rawData)
         # replace repeated spaces, tabs, hyphens, '\n', '\r\n', etc.
         cleanedText = filterRepeatedchars(
             cleanedText,
             deDupeList([' ', '\t', '\n', '\r\n', '-', '_', '.']))
         cleanedText = cleanedText.replace('\n', ' ')
         # remove invalid substrings:
         for stringToFilter in deDupeList(self.subStringsToFilter):
             cleanedText = cleanedText.replace(stringToFilter, " ")
     except Exception as e:
         logger.error("Error cleaning text: %s", e)
     return cleanedText
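The helper filterRepeatedchars() used above is not shown in these examples. Judging from the comment ("replace repeated spaces, tabs, hyphens, '\n', '\r\n', etc."), it collapses consecutive runs of each entry in the character list; a minimal sketch under that assumption (not the actual implementation from this codebase) could look like this:

def filterRepeatedchars(baseText: str, charList: list) -> str:
    """ Collapse consecutive repeats of each entry in charList into a single occurrence.
    Assumed behaviour inferred from how checkAndCleanText() calls it; the real implementation may differ.
    """
    for singleChar in charList:
        doubled = singleChar + singleChar
        # keep replacing until no doubled occurrence remains:
        while doubled in baseText:
            baseText = baseText.replace(doubled, singleChar)
    return baseText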
Code Example #2
def test_deDupeList():
    # Test to deduplicate list
    (parentFolder, sourceFolder, testdataFolder) = getAppFolders()
    sys.path.append(sourceFolder)
    import scraper_utils
    listWithDuplicates = ['one', 'two', 'two', 'three']
    resultList = scraper_utils.deDupeList(listWithDuplicates)
    print('Resulting list after de-duplicating:', resultList)
    assert len(resultList) == 3 and 'two' in resultList, "8. deDupeList() is not de-duplicating lists correctly."
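The test above only checks the length of the de-duplicated list and the membership of 'two', so it does not pin down whether ordering is preserved. A minimal sketch that would satisfy it (an assumption for illustration, not the actual scraper_utils.deDupeList() code) is:

def deDupeList(listWithDuplicates: list) -> list:
    """ Return a copy of the list with duplicate entries removed.
    dict.fromkeys() keeps only the first occurrence of each item, so the original order is preserved.
    """
    return list(dict.fromkeys(listWithDuplicates))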
Code Example #3
 def checkAndCleanText(self, inputText, rawData):
     """ Check and clean article text
     """
     cleanedText = inputText
     try:
         # ignore the newspaper extracted text, the alternate method text is more accurate:
         cleanedText = self.extractArticleBody(rawData)
         for badString in self.invalidTextStrings:
             if cleanedText.find(badString) >= 0:
                 logger.debug("%s: Found invalid text strings in data extracted: %s", self.pluginName, badString)
                 return None
         # replace repeated spaces, tabs, hyphens, '\n', '\r\n', etc.
         cleanedText = filterRepeatedchars(cleanedText,
                                           deDupeList([' ', '\t', '\n', '\r\n', '-', '_', '.']))
         # remove invalid substrings:
         for stringToFilter in deDupeList(self.subStringsToFilter):
             cleanedText = cleanedText.replace(stringToFilter, " ")
     except Exception as e:
         logger.error("Error cleaning text: %s", e)
     return cleanedText
Code Example #4
 def setKeyWords(self, articleKeyWordsList):
     """ set the keywords in the article
     """
     resultList = []
     try:
         for keyword in articleKeyWordsList:
             # clean words, trim whitespace:
             resultList.append(NewsEvent.cleanText(keyword))
         # de-duplicate the list
         resultList = deDupeList(resultList)
     except Exception as e:
         logger.error("Error cleaning keywords for article: %s", e)
     self.urlData["keywords"] = resultList
Code Example #5
    def addURLsToPendingTable(self, urlList: list, pluginName: str):
        """ Add newly identified URLs to the pending Table.

        Check duplicates using SQL:
        select count(*), url from pending_urls group by url having count(*)>1

        :param urlList: List of URLs to be added
        :param pluginName: Name of the plugin for which the URLs are to be added.
        :return:
        """
        sqlCon = None
        acqResult = False
        try:
            logger.debug(
                "Add URL list to pending table in db: Waiting to get db exclusive access..."
            )
            acqResult = self.dbAccessSemaphore.acquire(timeout=30)
            if acqResult is True:
                logger.debug(
                    "Adding URL list to pending table for plugin %s: Got exclusive db access.",
                    pluginName)
                sqlCon = SessionHistory.openConnFromfile(self.dbFileName)
                cur = sqlCon.cursor()
                urlList = deDupeList(urlList)
                # TODO: parallelize this to process entire list in one iteration
                for sURL in urlList:
                    # Ideally, if the URL already exists, its attempt count should be incremented by 1, but that is omitted here
                    cur.execute(
                        'insert or ignore into pending_urls (url, plugin_name, attempts) values (?, ?, 1)',
                        (sURL, pluginName))
                sqlCon.commit()
        except Exception as e:
            logger.error(f"Error while adding URL list to pending table: {e}")
        finally:
            if sqlCon:
                sqlCon.close()
            # release the semaphore only if it was actually acquired:
            if acqResult is True:
                self.dbAccessSemaphore.release()
            logger.debug(
                f"Completed adding URL list to pending table for plugin {pluginName}:"
                + " Released exclusive db access.")
Code Example #6
    def retrieveTodoURLList(self, pluginName: str) -> list:
        """ Retrieve URL list from the pending_urls table for the given plugin name

        :param pluginName: The name of the plugin for which pending URLs need to be listed
        :return: List of pending URLs
        """
        URLsFromSQLite = []
        sqlCon = None
        acqResult = False
        try:
            logger.debug(
                "Fetching pending url list: Waiting for db exclusive access..."
            )
            acqResult = self.dbAccessSemaphore.acquire()
            if acqResult is True:
                sqlQuery = "select distinct url from pending_urls where plugin_name='" + pluginName +\
                           "' and url not in (select url from failed_urls) and url not in (select url from url_list)"
                sqlCon = SessionHistory.openConnFromfile(self.dbFileName)
                cur = sqlCon.cursor()
                # execute query and get results:
                cur.execute(sqlQuery, (pluginName,))
                allResults = cur.fetchall()  # fill results into list
                for urlTuple in allResults:
                    URLsFromSQLite.append(urlTuple[0])
        except Exception as e:
            logger.error(
                "%s: Error when fetching pending url list from sqlite db: %s",
                pluginName, e)
        finally:
            if sqlCon:
                sqlCon.close()
            # release the semaphore only if it was actually acquired:
            if acqResult is True:
                self.dbAccessSemaphore.release()
            logger.debug(
                "Fetched pending url list for plugin %s: Released exclusive db access.",
                pluginName)
        URLsFromSQLite = deDupeList(URLsFromSQLite)
        logger.info(
            f'{pluginName}: Identified {len(URLsFromSQLite)} URLs from pending table of history database.'
        )
        return URLsFromSQLite
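Taken together, examples #5 and #6 form a simple enqueue/dequeue workflow against the pending_urls table. A hypothetical usage sketch, assuming sessionHistoryDB is an already constructed SessionHistory instance (its constructor is not shown above), and with placeholder URLs and plugin name:

# Hypothetical workflow; the URLs and the plugin name are placeholders for illustration.
newURLs = ['https://example.com/article-1', 'https://example.com/article-2']
sessionHistoryDB.addURLsToPendingTable(newURLs, 'mod_in_gdelt')
# later, fetch the URLs that are still pending (i.e. not yet fetched or failed):
pendingURLs = sessionHistoryDB.retrieveTodoURLList('mod_in_gdelt')
print(f'{len(pendingURLs)} URLs pending for this plugin.')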
Code Example #7
    def getURLsListForDate(self, runDate: datetime,
                           sessionHistoryDB: SessionHistory) -> list:
        """ Extract article list from the main URL.
        Since this is only a news aggregator, sets the plugin state to Types.STATE_STOPPED
         at the end of this method.

        :param sessionHistoryDB: Not used in this function
        :param runDate: Given query date for which news article URLs are to be retrieved
        :type runDate: datetime.datetime
        :return: List of URLs identified from this news source
        :rtype: list
        """
        urlList = []
        try:
            searchResultsURLForDate, dataDirForDate = self.prepare_url_datadir_for_date(
                runDate)
            if searchResultsURLForDate is not None:
                logger.debug('Downloading file from URL: %s',
                             searchResultsURLForDate)
                csv_zip = self.downloadDataArchive(searchResultsURLForDate,
                                                   self.pluginName)
                csv_files = mod_in_gdelt.extract_csvlist_from_archive(
                    csv_zip, dataDirForDate)
                for csv_filename in csv_files:
                    logger.debug("Expanded the fetched Zip archive to: %s",
                                 csv_filename)
                    url_items = mod_in_gdelt.extract_urls_from_csv(
                        csv_filename, country_code='IN')
                    urlList = urlList + url_items
                urlList = deDupeList(urlList)
            logger.info("Added %s URLs from aggregated news from %s",
                        len(urlList), searchResultsURLForDate)
        except Exception as e:
            logger.error(
                "%s: Error extracting URL list from main URL: %s",
                self.pluginName, e)
        self.pluginState = Types.STATE_STOPPED
        return urlList
Code Example #8
    def extract_urls_from_csv(csv_filename: str, country_code='IN') -> list:
        """ Extract URL list from CSV file

        :param csv_filename: file to read from
        :param country_code: ISO 2-character country code to filter news
        :return: List of relevant URLs extracted from CSV file
        """
        urlList = []
        # load csv file in pandas:
        # Columns (14,24) have mixed types. Specify dtype option on import or set low_memory=False.
        urlDF = pd.read_csv(csv_filename,
                            delimiter='\t',
                            header=None,
                            low_memory=False)
        # filter rows for the given country and pick out the article URLs:
        # column 51 is the country code, column 57 is the URL
        for item in urlDF[urlDF.iloc[:, 51] == country_code].iloc[:, 57].values:
            # put urls into list:
            urlList.append(item.strip())
        # delete csv file:
        os.remove(csv_filename)
        return deDupeList(urlList)