def __init__(self):
    '''
    Constructor: stores the HTML tag/attribute name lists, creates the
    connectMySQL handle and configures the lxml cleaner.
    '''
    # Plain name lists; how they are consumed is decided by the other
    # methods of the class (not visible from here).
    self.__htmlElements = [
        'body', 'header', 'nav', 'footer', 'article',
        'section', 'aside', 'div', 'span',
    ]
    self.__htmlAttributes = ['id', 'class']
    self.__htmlElementsSkip = ['script']

    self.__db = connectMySQL(db='xpath', port=3366)

    #=======================================================================
    # LXML CLEANER
    #=======================================================================
    # Each listed option is switched on, in this order.
    cleaner = Cleaner()
    for option in ('javascript', 'scripts', 'style', 'comments',
                   'embedded', 'frames', 'meta'):
        setattr(cleaner, option, True)
    self.__cleaner = cleaner
def __init__(self):
    '''
    Constructor: prepares the tag/attribute name lists, the database
    handle and a Cleaner instance with seven options enabled.
    '''
    # Same nine tag names as before, spelled as one whitespace-separated
    # string and split into a list.
    structuralTags = 'body header nav footer article section aside div span'
    self.__htmlElements = structuralTags.split()
    self.__htmlAttributes = ['id', 'class']
    self.__htmlElementsSkip = ['script']

    self.__db = connectMySQL(db='xpath', port=3366)

    #=======================================================================
    # LXML CLEANER
    #=======================================================================
    enabledOptions = [
        'javascript',
        'scripts',
        'style',
        'comments',
        'embedded',
        'frames',
        'meta',
    ]
    self.__cleaner = Cleaner()
    for optionName in enabledOptions:
        # Equivalent to ``self.__cleaner.<optionName> = True``.
        setattr(self.__cleaner, optionName, True)
from TechDashAPI.mysqlUtilities import connectMySQL
from TechDashAPI.ContentExtractor import ContentExtractor
from TechDashAPI.ContentExtractorTrainer import ContentExtractorTrainer
from TechDashAPI.createDOM import createDom
from TechDashAPI.util import utilities
from TechDashAPI.topicModeling import techDashTopicModel

from gensim.models import LdaModel

# Database handle and helper objects created up front.
db = connectMySQL(db='xpath', port=3366)
filesFolder = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'
utilitiesFunctions = utilities()

# Folder and base name of the stored LDA model; the same two values are
# handed to both the gensim loader and the project topic-model wrapper.
modelDestination = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/'
modelName = 'fullModel_100P_20T'
model = LdaModel.load(modelDestination + modelName + '.lda', mmap=None)
topicModel = techDashTopicModel(
    destination=modelDestination,
    fileName='fullModel',
    modelName=modelName,
)

#===============================================================================
# UPDATE ALL ARTICLES TO NEW TOPICS
#===============================================================================
# Adjacent literals concatenate into the single query string.
sqlQuery = (
    "SELECT `xpathValuesXPath`.`xpathValuesID`, "
    "`xpathValuesXPath`.`xpathValuesContent` "
    "FROM `xpath`.`xpathValuesXPath`; "
)

db.executeQuery(sqlQuery)

# `_connectMySQL__results` is the name-mangled form of a private
# `__results` attribute on connectMySQL (presumably filled by
# executeQuery -- confirm in mysqlUtilities).
for item in db._connectMySQL__results:
    # Index 1 is the second selected column, xpathValuesContent.
    topicModelCat = topicModel.getDocumentTopics(item[1])
def __init__(self, pwd, feedURL=''):
    '''
    Constructor: stores the feed URL(s), creates one bookkeeping record
    per feed and sets up the database, utility and spaCy helpers.

    pwd     -- base directory; pwd + '/xpathModels/' is stored as the
               files folder.
    feedURL -- a single feed URL or a list of feed URLs. The default ''
               selects the built-in list below.
    '''
    #=======================================================================
    # Feeds noted in the original comments but not in the default list:
    #   https://news.ycombinator.com/rss
    #   http://skimfeed.com/tech.html
    #   https://gcn.com/rss-feeds/all.aspx
    #=======================================================================
    if feedURL == '':
        # The news.com.au feed was previously listed twice; the second
        # copy is removed so the list and the __etags keys stay in step
        # and the feed is not handled twice per pass.
        self.__feedURL = [
            'http://feeds.news.com.au/public/rss/2.0/news_tech_506.xml',
            'http://www.cnet.com/rss/all/',
            'http://www.wired.com/category/gear/feed/',
            'http://www.wired.com/category/science/feed/',
            'http://www.infoworld.com/index.rss',
            'http://www.pcworld.com/index.rss',
            'http://www.computerworld.com/index.rss',
            'http://www.networkcomputing.com/rss_simple.asp',
            'http://www.engadget.com/rss-full.xml',
            'http://www.digitaltrends.com/feed/',
            'http://www.independent.co.uk/life-style/gadgets-and-tech/rss',
            'http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml',
            'http://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
            'http://feeds.reuters.com/reuters/technologyNews?format=xml',
            'http://feeds.reuters.com/reuters/scienceNews',
            'http://feeds.bbci.co.uk/news/technology/rss.xml',
            'http://feeds.feedburner.com/Technibble',
            'http://feeds.feedburner.com/TechCrunch/',
            'http://feeds.feedburner.com/techradar/allnews',
            'http://feeds.arstechnica.com/arstechnica/index?format=xml',
            'http://feeds2.feedburner.com/ziffdavis/pcmag/breakingnews',
        ]
    else:
        self.__feedURL = feedURL

    # One record per feed, keyed by URL. A non-list feedURL is treated as
    # a single key, exactly as the original two-branch code did.
    if isinstance(self.__feedURL, list):
        feeds = self.__feedURL
    else:
        feeds = [self.__feedURL]
    self.__etags = {}
    for url in feeds:
        self.__etags[url] = {
            'etag': None,
            'modified': None,
            'feed': None,
            'changed': False,
        }

    self.__articleLinks = []
    self.__domainDBkey = None
    self.__db = connectMySQL(db='xpath', port=3366)
    self.__pwd = pwd
    self.__filesFolder = pwd + '/xpathModels/'
    self.__utilitiesFunctions = utilities()

    #=======================================================================
    # STANFORD NER
    #=======================================================================
    # A Stanford CoreNLP extractor was once created here; it is disabled
    # in the original:
    # self.__extractNerStanford = CoreNLP("nerparse", corenlp_jars=["/Users/jurica/Downloads/stanford-corenlp-full-2015-04-20/*"])

    # spaCy data directory: SPACY_DATA from the environment if set,
    # otherwise LOCAL_DATA_DIR.
    self.__spacyData_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
    self.__SpacyNLP = English(data_dir=self.__spacyData_dir)
def __init__(self, pwd, feedURL=''):
    '''
    Constructor: stores the feed URL(s), creates one bookkeeping record
    per feed and sets up the database, utility and spaCy helpers.

    pwd     -- base directory; pwd + '/xpathModels/' is stored as the
               files folder.
    feedURL -- a single feed URL or a list of feed URLs. The default ''
               selects the built-in list below.
    '''
    #=======================================================================
    # Feeds noted in the original comments but not in the default list:
    #   https://news.ycombinator.com/rss
    #   http://skimfeed.com/tech.html
    #   https://gcn.com/rss-feeds/all.aspx
    #=======================================================================
    if feedURL == '':
        # The news.com.au feed was previously listed twice; the second
        # copy is removed so the list and the __etags keys stay in step
        # and the feed is not handled twice per pass.
        self.__feedURL = [
            'http://feeds.news.com.au/public/rss/2.0/news_tech_506.xml',
            'http://www.cnet.com/rss/all/',
            'http://www.wired.com/category/gear/feed/',
            'http://www.wired.com/category/science/feed/',
            'http://www.infoworld.com/index.rss',
            'http://www.pcworld.com/index.rss',
            'http://www.computerworld.com/index.rss',
            'http://www.networkcomputing.com/rss_simple.asp',
            'http://www.engadget.com/rss-full.xml',
            'http://www.digitaltrends.com/feed/',
            'http://www.independent.co.uk/life-style/gadgets-and-tech/rss',
            'http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml',
            'http://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
            'http://feeds.reuters.com/reuters/technologyNews?format=xml',
            'http://feeds.reuters.com/reuters/scienceNews',
            'http://feeds.bbci.co.uk/news/technology/rss.xml',
            'http://feeds.feedburner.com/Technibble',
            'http://feeds.feedburner.com/TechCrunch/',
            'http://feeds.feedburner.com/techradar/allnews',
            'http://feeds.arstechnica.com/arstechnica/index?format=xml',
            'http://feeds2.feedburner.com/ziffdavis/pcmag/breakingnews',
        ]
    else:
        self.__feedURL = feedURL

    # One record per feed, keyed by URL. A non-list feedURL is treated as
    # a single key, exactly as the original two-branch code did.
    if isinstance(self.__feedURL, list):
        feeds = self.__feedURL
    else:
        feeds = [self.__feedURL]
    self.__etags = {}
    for url in feeds:
        self.__etags[url] = {
            'etag': None,
            'modified': None,
            'feed': None,
            'changed': False,
        }

    self.__articleLinks = []
    self.__domainDBkey = None
    self.__db = connectMySQL(db='xpath', port=3366)
    self.__pwd = pwd
    self.__filesFolder = pwd + '/xpathModels/'
    self.__utilitiesFunctions = utilities()

    #=======================================================================
    # STANFORD NER
    #=======================================================================
    # A Stanford CoreNLP extractor was once created here; it is disabled
    # in the original:
    # self.__extractNerStanford = CoreNLP("nerparse", corenlp_jars=["/Users/jurica/Downloads/stanford-corenlp-full-2015-04-20/*"])

    # spaCy data directory: SPACY_DATA from the environment if set,
    # otherwise LOCAL_DATA_DIR.
    self.__spacyData_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
    self.__SpacyNLP = English(data_dir=self.__spacyData_dir)