Beispiel #1
0
 def __init__(self):
     '''
     Constructor.

     Stores the HTML element / attribute name lists, creates the
     connectMySQL handle for the 'xpath' database on port 3366 and
     prepares a Cleaner instance with a fixed set of options enabled.
     '''
     # Element names, attribute names and element names to skip
     # (presumably consumed by the extraction methods of this class --
     # their use is not visible from the constructor).
     self.__htmlElements = [
         'body', 'header', 'nav', 'footer', 'article',
         'section', 'aside', 'div', 'span',
     ]
     self.__htmlAttributes = ['id', 'class']
     self.__htmlElementsSkip = ['script']
     self.__db = connectMySQL(port=3366, db='xpath')

     #=======================================================================
     # LXML CLEANER
     # Every option named below is switched on (set to True).
     #=======================================================================
     htmlCleaner = Cleaner()
     enabledOptions = (
         'javascript', 'scripts', 'style', 'comments',
         'embedded', 'frames', 'meta',
     )
     for optionName in enabledOptions:
         setattr(htmlCleaner, optionName, True)
     self.__cleaner = htmlCleaner
Beispiel #2
0
    def __init__(self):
        '''
        Constructor.

        Initialises the lists of HTML element and attribute names, the
        connectMySQL handle ('xpath' database, port 3366) and the Cleaner
        object whose options are enabled below.
        '''
        # Name lists kept on the instance; 'script' is the only entry of
        # the skip list.
        self.__htmlElements = ['body', 'header', 'nav', 'footer', 'article',
                               'section', 'aside', 'div', 'span']
        self.__htmlAttributes = ['id', 'class']
        self.__htmlElementsSkip = ['script']

        # Database handle used by this object.
        self.__db = connectMySQL(port=3366, db='xpath')

        #=======================================================================
        # LXML CLEANER
        # A short local alias keeps the option assignments compact; it is
        # the same object that is stored on the instance.
        #=======================================================================
        cleaner = self.__cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.style = True
        cleaner.comments = True
        cleaner.embedded = True
        cleaner.frames = True
        cleaner.meta = True
Beispiel #3
0
from TechDashAPI.mysqlUtilities import connectMySQL
from TechDashAPI.ContentExtractor import ContentExtractor
from TechDashAPI.ContentExtractorTrainer import ContentExtractorTrainer
from TechDashAPI.createDOM import createDom
from TechDashAPI.util import utilities
from TechDashAPI.topicModeling import techDashTopicModel

from gensim.models import LdaModel

# Shared handles for the script: database access, the folder holding the
# xpath models and the project's utilities object.
db = connectMySQL(port=3366, db='xpath')
filesFolder = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'
utilitiesFunctions = utilities()

# LDA model location. The gensim model is loaded from
# <modelDestination><modelName>.lda and the project's topic-model wrapper is
# pointed at the same folder and model name.
modelDestination = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/'
modelName = 'fullModel_100P_20T'
model = LdaModel.load('%s%s.lda' % (modelDestination, modelName), mmap=None)
topicModel = techDashTopicModel(
    destination=modelDestination,
    fileName='fullModel',
    modelName=modelName
)

#===============================================================================
# UPDATE ALL ARTICLES TO NEW TOPICS
#===============================================================================

# Select the ID and the content of every stored xpath value.
sqlQuery = (
    "SELECT `xpathValuesXPath`.`xpathValuesID`, "
    "`xpathValuesXPath`.`xpathValuesContent` "
    "FROM `xpath`.`xpathValuesXPath`; "
)

db.executeQuery(sqlQuery)

# The rows are read through the name-mangled private attribute of the
# connectMySQL object.
resultRows = db._connectMySQL__results
for item in resultRows:
    # item[1] is presumably the xpathValuesContent column (second column of
    # the SELECT above) -- confirm against connectMySQL's row format.
    topicModelCat = topicModel.getDocumentTopics(item[1])
Beispiel #4
0
    def __init__(self, pwd, feedURL=''):
        '''
        Set up the feed list, the per-feed bookkeeping, the database handle
        and the spaCy pipeline.

        :param pwd: base directory; '/xpathModels/' is appended to it to
            form the models folder path.
        :param feedURL: a single feed URL or a list of feed URLs. The
            default empty string selects the built-in feed list below.
        '''

        #=======================================================================
        # Feeds noted here but not part of the default list:
        # https://news.ycombinator.com/rss
        # http://skimfeed.com/tech.html
        # https://gcn.com/rss-feeds/all.aspx',
        #=======================================================================

        if feedURL == '':
            # Every URL appears exactly once: the bookkeeping dict below is
            # keyed by URL, so a repeated URL would be stored twice in the
            # list while sharing a single bookkeeping entry.
            self.__feedURL = [
                'http://feeds.news.com.au/public/rss/2.0/news_tech_506.xml',
                'http://www.cnet.com/rss/all/',
                'http://www.wired.com/category/gear/feed/',
                'http://www.wired.com/category/science/feed/',
                'http://www.infoworld.com/index.rss',
                'http://www.pcworld.com/index.rss',
                'http://www.computerworld.com/index.rss',
                'http://www.networkcomputing.com/rss_simple.asp',
                'http://www.engadget.com/rss-full.xml',
                'http://www.digitaltrends.com/feed/',
                'http://www.independent.co.uk/life-style/gadgets-and-tech/rss',
                'http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml',
                'http://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
                'http://feeds.reuters.com/reuters/technologyNews?format=xml',
                'http://feeds.reuters.com/reuters/scienceNews',
                'http://feeds.bbci.co.uk/news/technology/rss.xml',
                'http://feeds.feedburner.com/Technibble',
                'http://feeds.feedburner.com/TechCrunch/',
                'http://feeds.feedburner.com/techradar/allnews',
                'http://feeds.arstechnica.com/arstechnica/index?format=xml',
                'http://feeds2.feedburner.com/ziffdavis/pcmag/breakingnews'
            ]
        else:
            self.__feedURL = feedURL

        # One bookkeeping entry per feed. A list contributes one key per
        # element; any other value is used as a single key.
        if isinstance(self.__feedURL, list):
            feedKeys = self.__feedURL
        else:
            feedKeys = [self.__feedURL]

        self.__etags = {}
        for feedKey in feedKeys:
            # Each feed gets its own fresh dict so entries never share state.
            self.__etags[feedKey] = {
                'etag': None,
                'modified': None,
                'feed': None,
                'changed': False
            }

        self.__articleLinks = []
        self.__domainDBkey = None
        self.__db = connectMySQL(db='xpath', port=3366)
        self.__pwd = pwd
        self.__filesFolder = pwd + '/xpathModels/'
        self.__utilitiesFunctions = utilities()

        #=======================================================================
        # STANFORD NER (disabled; kept for reference)
        #=======================================================================
        #=======================================================================
        # self.__extractNerStanford = CoreNLP( "nerparse",corenlp_jars=["/Users/jurica/Downloads/stanford-corenlp-full-2015-04-20/*"])
        #=======================================================================
        # spaCy data directory: the SPACY_DATA environment variable when set,
        # otherwise LOCAL_DATA_DIR.
        self.__spacyData_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
        self.__SpacyNLP = English(data_dir=self.__spacyData_dir)
Beispiel #5
0
    def __init__(self, pwd, feedURL=''):
        '''
        Initialise the feed URLs, their cache bookkeeping, the database
        handle and the spaCy pipeline.

        :param pwd: base directory; the models folder is pwd + '/xpathModels/'.
        :param feedURL: one feed URL or a list of feed URLs; the default
            empty string means "use the built-in list of feeds".
        '''

        #=======================================================================
        # Feeds noted here but not included in the built-in list:
        # https://news.ycombinator.com/rss
        # http://skimfeed.com/tech.html
        # https://gcn.com/rss-feeds/all.aspx',
        #=======================================================================

        if feedURL == '':
            # NOTE: keep every URL unique. The bookkeeping dict is keyed by
            # URL, so a repeated entry only duplicates the URL in this list.
            defaultFeeds = [
                'http://feeds.news.com.au/public/rss/2.0/news_tech_506.xml',
                'http://www.cnet.com/rss/all/',
                'http://www.wired.com/category/gear/feed/',
                'http://www.wired.com/category/science/feed/',
                'http://www.infoworld.com/index.rss',
                'http://www.pcworld.com/index.rss',
                'http://www.computerworld.com/index.rss',
                'http://www.networkcomputing.com/rss_simple.asp',
                'http://www.engadget.com/rss-full.xml',
                'http://www.digitaltrends.com/feed/',
                'http://www.independent.co.uk/life-style/gadgets-and-tech/rss',
                'http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml',
                'http://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
                'http://feeds.reuters.com/reuters/technologyNews?format=xml',
                'http://feeds.reuters.com/reuters/scienceNews',
                'http://feeds.bbci.co.uk/news/technology/rss.xml',
                'http://feeds.feedburner.com/Technibble',
                'http://feeds.feedburner.com/TechCrunch/',
                'http://feeds.feedburner.com/techradar/allnews',
                'http://feeds.arstechnica.com/arstechnica/index?format=xml',
                'http://feeds2.feedburner.com/ziffdavis/pcmag/breakingnews'
            ]
            self.__feedURL = defaultFeeds
        else:
            self.__feedURL = feedURL

        # Bookkeeping per feed: a list yields one key per element, any other
        # value becomes the single key of the dict.
        configuredFeeds = self.__feedURL
        if not isinstance(configuredFeeds, list):
            configuredFeeds = [configuredFeeds]
        self.__etags = {
            url: {
                'etag': None,
                'modified': None,
                'feed': None,
                'changed': False
            }
            for url in configuredFeeds
        }

        self.__articleLinks = []
        self.__domainDBkey = None
        self.__db = connectMySQL(db='xpath', port=3366)
        self.__pwd = pwd
        self.__filesFolder = pwd + '/xpathModels/'
        self.__utilitiesFunctions = utilities()

        #=======================================================================
        # STANFORD NER (disabled; kept for reference)
        #=======================================================================
        #=======================================================================
        # self.__extractNerStanford = CoreNLP( "nerparse",corenlp_jars=["/Users/jurica/Downloads/stanford-corenlp-full-2015-04-20/*"])
        #=======================================================================
        # The spaCy data directory comes from the SPACY_DATA environment
        # variable, falling back to LOCAL_DATA_DIR when it is not set.
        self.__spacyData_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
        self.__SpacyNLP = English(data_dir=self.__spacyData_dir)