Example #1
0
class ContentExtractor(object):
    '''
    Content extractor for a single domain: loads the domain's pickled
    xpath model from <pwd>/xpathModels/ and opens the target HTML page.

    steps:
        get serialized json file
    '''
    @profile
    def __init__(self,
                 domain,
                 htmlFileURL,
                 pwd,
                 CoreNLPner='',
                 spacyNER='',
                 dbConnection=''):
        '''
        Constructor

        :param domain: DB key of the domain; names the pickled xpath model.
        :param htmlFileURL: URL of the HTML page to extract content from.
        :param pwd: base directory containing xpathModels/ and modelsLDA/.
        :param CoreNLPner: optional pre-built Stanford CoreNLP NER handle.
        :param spacyNER: optional pre-built spaCy NER handle.
        :param dbConnection: unused here; kept for interface compatibility.
        '''
        self.__fileURL = htmlFileURL
        self.__domainDBkey = domain

        # Load the per-domain xpath model.  BUGFIX: loading raises
        # pickle.UnpicklingError / IOError / EOFError — PicklingError is
        # dump-side only, so the original handler could never fire.  Use
        # `with` so the file handle is always closed.
        try:
            with open(pwd + '/xpathModels/' + str(self.__domainDBkey) +
                      '.pickle', 'rb') as modelFile:
                self.__XpathList = pickle.load(modelFile)
        except (IOError, pickle.UnpicklingError, EOFError) as e:
            print(e)

        # Structural HTML elements/attributes considered for xpath building.
        self.__htmlElements = [
            'body', 'header', 'nav', 'footer', 'article', 'section', 'aside',
            'div', 'span'
        ]
        self.__htmlAttributes = ['id', 'class']
        self.__documentIDKey = ''
        self.__utilitiesFunctions = utilities()

        # DB CONNECTIVITY AND FUNCTIONALITY
        self.__db = connectMySQL(db='xpath', port=3366)
        self.__topicModel = techDashTopicModel(destination=pwd + '/modelsLDA/',
                                               fileName='fullModel',
                                               modelName='fullModel_100P_20T')

        #=======================================================================
        # OPEN URL
        #=======================================================================
        url2Open, self.__htmlFile = self.__utilitiesFunctions.openULR(
            self.__fileURL)

        #=======================================================================
        # NER
        #=======================================================================
        self.__extractNerStanford = CoreNLPner
        self.__extractNerSpacy = spacyNER
Example #2
0
from TechDashAPI.mysqlUtilities import connectMySQL
from TechDashAPI.ContentExtractor import ContentExtractor
from TechDashAPI.ContentExtractorTrainer import ContentExtractorTrainer
from TechDashAPI.createDOM import createDom
from TechDashAPI.util import utilities
from TechDashAPI.topicModeling import techDashTopicModel

from gensim.models import LdaModel

# Batch script: re-categorize every stored article against the current
# LDA topic model.
db = connectMySQL(db='xpath', port=3366)
# Hard-coded dev path holding the pickled per-domain xpath models.
filesFolder = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'
utilitiesFunctions = utilities()

modelDestination = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/'
modelName ='fullModel_100P_20T'
# mmap=None loads the LDA model fully into memory instead of memory-mapping.
model = LdaModel.load(modelDestination+modelName+'.lda',  mmap=None)
topicModel = techDashTopicModel(destination='/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/', fileName='fullModel', modelName='fullModel_100P_20T')

#===============================================================================
# UPDATE ALL ARTICLES TO NEW TOPICS
#===============================================================================

# Fetch id + content for every stored xpath value row.
sqlQuery = """SELECT `xpathValuesXPath`.`xpathValuesID`, `xpathValuesXPath`.`xpathValuesContent` FROM `xpath`.`xpathValuesXPath`; """

db.executeQuery(sqlQuery)

# NOTE(review): reaches into connectMySQL's name-mangled private
# `__results` attribute — presumably the result rows of the last query;
# a public accessor on connectMySQL would be cleaner.  Verify against
# TechDashAPI.mysqlUtilities.
for item in db._connectMySQL__results:
    #===========================================================================
    # print item
    #===========================================================================
    # item[1] is the article content; the computed topic category is not
    # used further in this visible snippet — TODO confirm the script is
    # complete (an UPDATE statement may be missing here).
    topicModelCat = topicModel.getDocumentTopics(item[1])
Example #3
0
    def __init__(self, pwd, feedURL=''):
        """Initialise feed reader state, DB connection and spaCy pipeline.

        Falls back to a built-in list of technology/science news RSS
        feeds when no feedURL is supplied.
        """
        # Candidate feeds kept for reference, not currently polled:
        #   https://news.ycombinator.com/rss
        #   http://skimfeed.com/tech.html
        #   https://gcn.com/rss-feeds/all.aspx
        if feedURL == '':
            self.__feedURL = [
                'http://feeds.news.com.au/public/rss/2.0/news_tech_506.xml',
                'http://www.cnet.com/rss/all/',
                'http://www.wired.com/category/gear/feed/',
                'http://www.wired.com/category/science/feed/',
                'http://www.infoworld.com/index.rss',
                'http://www.pcworld.com/index.rss',
                'http://www.computerworld.com/index.rss',
                'http://www.networkcomputing.com/rss_simple.asp',
                'http://www.engadget.com/rss-full.xml',
                'http://www.digitaltrends.com/feed/',
                'http://www.independent.co.uk/life-style/gadgets-and-tech/rss',
                'http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml',
                'http://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
                'http://feeds.reuters.com/reuters/technologyNews?format=xml',
                'http://feeds.reuters.com/reuters/scienceNews',
                'http://feeds.bbci.co.uk/news/technology/rss.xml',
                'http://feeds.feedburner.com/Technibble',
                'http://feeds.feedburner.com/TechCrunch/',
                'http://feeds.feedburner.com/techradar/allnews',
                'http://feeds.news.com.au/public/rss/2.0/news_tech_506.xml',
                'http://feeds.arstechnica.com/arstechnica/index?format=xml',
                'http://feeds2.feedburner.com/ziffdavis/pcmag/breakingnews'
            ]
        else:
            self.__feedURL = feedURL

        # Per-feed bookkeeping: HTTP conditional-GET headers (etag /
        # modified), the parsed feed, and a flag marking whether the feed
        # changed since the last poll.  A fresh dict is built per feed so
        # no state is shared between entries.
        if isinstance(self.__feedURL, list):
            self.__etags = dict(
                (feedAddress, {'etag': None,
                               'modified': None,
                               'feed': None,
                               'changed': False})
                for feedAddress in self.__feedURL)
        else:
            self.__etags = {
                self.__feedURL: {'etag': None,
                                 'modified': None,
                                 'feed': None,
                                 'changed': False}
            }

        self.__articleLinks = []
        self.__domainDBkey = None
        self.__db = connectMySQL(db='xpath', port=3366)
        self.__pwd = pwd
        self.__filesFolder = pwd + '/xpathModels/'
        self.__utilitiesFunctions = utilities()

        # Stanford CoreNLP NER is disabled (kept below for reference);
        # spaCy is used for NER instead.
        #=======================================================================
        # self.__extractNerStanford = CoreNLP( "nerparse",corenlp_jars=["/Users/jurica/Downloads/stanford-corenlp-full-2015-04-20/*"])
        #=======================================================================
        self.__spacyData_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
        self.__SpacyNLP = English(data_dir=self.__spacyData_dir)
    def __init__(self, domain, htmlFileURL, pwd, dbConnection='', path=''):
        """Prepare the per-domain xpath trainer: load background knowledge
        and saved k-means centroids, then open the target HTML page.

        :param domain: domain key; names the resource files under
            <pwd>/xpathModels/.
        :param htmlFileURL: URL of the HTML page to open and process.
        :param pwd: working directory containing xpathModels/.
        :param dbConnection: unused here; kept for interface compatibility.
        :param path: unused here; kept for interface compatibility.
        """
        #=======================================================================
        # LOGGING INFORMATION
        #=======================================================================
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

        #=======================================================================
        # INITIAL VARIABLES
        #=======================================================================
        self.__domain = str(domain)
        self.__htmlFileURL = htmlFileURL
        self.__xpathPaths = []

        #=======================================================================
        # SET UP THE DIRECTORY STRUCTURE; WHERE THE FILES ARE/WILL BE STORED
        #=======================================================================
        self.__dictionaryPath = pwd + '/xpathModels/'
        self.__domainResourcesFile = self.__dictionaryPath + self.__domain

        #=======================================================================
        # STRUCTURE AND CONTENT ELEMENTS FOR XPATH CREATION
        #=======================================================================
        self.__htmlElements = ['body', 'header', 'nav', 'footer', 'article', 'section', 'aside', 'div', 'span']
        self.__htmlAttributes = ['id', 'class']
        self.__htmlElementsSkip = ['script', 'style']

        #=======================================================================
        # LOAD BACKGROUND KNOWLEDGE
        #=======================================================================
        # BUGFIX: the original used a bare `except:` and
        # `print traceback.print_exc()` — print_exc() returns None, so an
        # extra "None" line was printed; the opened file handle also leaked.
        try:
            with open(self.__domainResourcesFile + '_bckKnowledge.pickle', 'rb') as bckFile:
                self.__htmlFileBackgroundKnowledge = pickle.load(bckFile)
        except Exception:
            self.__htmlFileBackgroundKnowledge = {}
            traceback.print_exc()

        #=======================================================================
        # SET UP K-MEANS AND DEFINE CLUSTER CENTERS
        #=======================================================================
        # Re-use previously saved centroids when available; otherwise start
        # k-means from scratch with 2 clusters.
        try:
            with open(self.__domainResourcesFile + '_centroids.pickle', 'rb') as centroidFile:
                centroids = pickle.load(centroidFile)
            self.__kMeansValues = KMeans(n_clusters=2, init=centroids)
        except Exception:
            self.__kMeansValues = KMeans(n_clusters=2)

        #=======================================================================
        # UTILITIES FUNCTION
        #=======================================================================
        self.__utilitiesFunctions = utilities()
        url2Open, self.__htmlFile = self.__utilitiesFunctions.openULR(self.__htmlFileURL)
Example #5
0
    def __init__(self, pwd, feedURL= ''):
        """Set up RSS feed tracking, the DB connection and spaCy NER.

        When feedURL is not given, a built-in list of technology and
        science news feeds is used.
        """
        # Candidate feeds kept for reference, not currently polled:
        #   https://news.ycombinator.com/rss
        #   http://skimfeed.com/tech.html
        #   https://gcn.com/rss-feeds/all.aspx
        if feedURL == '':
            self.__feedURL = [
                'http://feeds.news.com.au/public/rss/2.0/news_tech_506.xml',
                'http://www.cnet.com/rss/all/',
                'http://www.wired.com/category/gear/feed/',
                'http://www.wired.com/category/science/feed/',
                'http://www.infoworld.com/index.rss',
                'http://www.pcworld.com/index.rss',
                'http://www.computerworld.com/index.rss',
                'http://www.networkcomputing.com/rss_simple.asp',
                'http://www.engadget.com/rss-full.xml',
                'http://www.digitaltrends.com/feed/',
                'http://www.independent.co.uk/life-style/gadgets-and-tech/rss',
                'http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml',
                'http://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
                'http://feeds.reuters.com/reuters/technologyNews?format=xml',
                'http://feeds.reuters.com/reuters/scienceNews',
                'http://feeds.bbci.co.uk/news/technology/rss.xml',
                'http://feeds.feedburner.com/Technibble',
                'http://feeds.feedburner.com/TechCrunch/',
                'http://feeds.feedburner.com/techradar/allnews',
                'http://feeds.news.com.au/public/rss/2.0/news_tech_506.xml',
                'http://feeds.arstechnica.com/arstechnica/index?format=xml',
                'http://feeds2.feedburner.com/ziffdavis/pcmag/breakingnews'
            ]
        else:
            self.__feedURL = feedURL

        def freshFeedState():
            # One bookkeeping record per feed: conditional-GET headers,
            # the parsed feed, and a changed-since-last-poll flag.
            return {'etag': None,
                    'modified': None,
                    'feed': None,
                    'changed': False}

        if isinstance(self.__feedURL, list):
            self.__etags = {}
            for feedAddress in self.__feedURL:
                self.__etags[feedAddress] = freshFeedState()
        else:
            self.__etags = {self.__feedURL: freshFeedState()}

        self.__articleLinks = []
        self.__domainDBkey = None
        self.__db = connectMySQL(db='xpath', port=3366)
        self.__pwd = pwd
        self.__filesFolder = pwd + '/xpathModels/'
        self.__utilitiesFunctions = utilities()

        # Stanford CoreNLP NER is disabled (kept below for reference);
        # spaCy handles NER instead.
        #=======================================================================
        # self.__extractNerStanford = CoreNLP( "nerparse",corenlp_jars=["/Users/jurica/Downloads/stanford-corenlp-full-2015-04-20/*"])
        #=======================================================================
        self.__spacyData_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
        self.__SpacyNLP = English(data_dir=self.__spacyData_dir)
Example #6
0
    def __init__(self, domain, htmlFileURL, pwd, dbConnection='', path=''):
        """Prepare the per-domain xpath trainer: load background knowledge
        and saved k-means centroids, then open the target HTML page.

        :param domain: domain key; names the resource files under
            <pwd>/xpathModels/.
        :param htmlFileURL: URL of the HTML page to open and process.
        :param pwd: working directory containing xpathModels/.
        :param dbConnection: unused here; kept for interface compatibility.
        :param path: unused here; kept for interface compatibility.
        """
        #=======================================================================
        # LOGGING INFORMATION
        #=======================================================================
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)

        #=======================================================================
        # INITIAL VARIABLES
        #=======================================================================
        self.__domain = str(domain)
        self.__htmlFileURL = htmlFileURL
        self.__xpathPaths = []

        #=======================================================================
        # SET UP THE DIRECTORY STRUCTURE; WHERE THE FILES ARE/WILL BE STORED
        #=======================================================================
        self.__dictionaryPath = pwd + '/xpathModels/'
        self.__domainResourcesFile = self.__dictionaryPath + self.__domain

        #=======================================================================
        # STRUCTURE AND CONTENT ELEMENTS FOR XPATH CREATION
        #=======================================================================
        self.__htmlElements = [
            'body', 'header', 'nav', 'footer', 'article', 'section', 'aside',
            'div', 'span'
        ]
        self.__htmlAttributes = ['id', 'class']
        self.__htmlElementsSkip = ['script', 'style']

        #=======================================================================
        # LOAD BACKGROUND KNOWLEDGE
        #=======================================================================
        # BUGFIX: the original used a bare `except:` and
        # `print traceback.print_exc()` — print_exc() returns None, so an
        # extra "None" line was printed; the opened file handle also leaked.
        try:
            with open(self.__domainResourcesFile + '_bckKnowledge.pickle',
                      'rb') as bckFile:
                self.__htmlFileBackgroundKnowledge = pickle.load(bckFile)
        except Exception:
            self.__htmlFileBackgroundKnowledge = {}
            traceback.print_exc()

        #=======================================================================
        # SET UP K-MEANS AND DEFINE CLUSTER CENTERS
        #=======================================================================
        # Re-use previously saved centroids when available; otherwise start
        # k-means from scratch with 2 clusters.
        try:
            with open(self.__domainResourcesFile + '_centroids.pickle',
                      'rb') as centroidFile:
                centroids = pickle.load(centroidFile)
            self.__kMeansValues = KMeans(n_clusters=2, init=centroids)
        except Exception:
            self.__kMeansValues = KMeans(n_clusters=2)

        #=======================================================================
        # UTILITIES FUNCTION
        #=======================================================================
        self.__utilitiesFunctions = utilities()
        url2Open, self.__htmlFile = self.__utilitiesFunctions.openULR(
            self.__htmlFileURL)