def __init__(self, destination, fileName, modelName='', ldaPasses='', topicNum=''):
    '''
    Constructor
    '''
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    self.__destination = destination
    self.__fileName = fileName
    self.__modelName = modelName
    self.__ldaPasses = ldaPasses
    self.__topicNum = topicNum

    #=======================================================================
    # STOP WORDS AND CHARACTERS
    #=======================================================================
    self.__stopwords = stopwords.words('english')  # + string.punctuation
    self.__chars_to_remove = [u'[', u']', u'(', u')', u'*', u'%', u'{', u'}', u'\n', u'\n\n', u'\t',
                              u';', u'/', u'^', u'--', u'\\', u'+', u'-', u'.', u'?', u'&', u'#', u'', u'']
    self.__stopwords.extend(self.__chars_to_remove)
    self.__stopwords.extend([item for item in string.punctuation])

    #=======================================================================
    # DATABASE
    #=======================================================================
    self.__db = connectMySQL(db='xpath', port=3366)
    self.__queryResults = None
    self.__cleanedCorpus = []

    if modelName != '' and os.path.exists(self.__destination + modelName + '.lda'):
        self.__ldaModel = LdaModel.load(self.__destination + modelName + '.lda', mmap='r')

    if fileName != '' and os.path.exists(self.__destination + fileName + '.dict'):
        self.__modelDict = corpora.Dictionary.load(self.__destination + fileName + '.dict')
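#=======================================================================
# USAGE SKETCH (not part of the original constructor): assuming the method
# above belongs to the techDashTopicModel class that ContentExtractor below
# instantiates, loading a previously trained model only needs the destination
# folder, the dictionary file name, and the model name. The import path and
# destination directory are hypothetical; the argument values mirror those
# used by ContentExtractor.
#=======================================================================
from topicModeling import techDashTopicModel  # hypothetical module path

topicModel = techDashTopicModel(destination='/home/user/TechDashboard/modelsLDA/',
                                fileName='fullModel',
                                modelName='fullModel_100P_20T')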
class ContentExtractor(object):
    '''
    classdocs
    '''
    '''
    steps: get serialized json file
    '''

    @profile
    def __init__(self, domain, htmlFileURL, pwd, CoreNLPner='', spacyNER='', dbConnection=''):
        '''
        Constructor
        '''
        self.__fileURL = htmlFileURL
        self.__domainDBkey = domain

        try:
            self.__XpathList = pickle.load(
                open(pwd + '/xpathModels/' + str(self.__domainDBkey) + '.pickle', 'rb'))
            #===================================================================
            # self.__XpathListID = pickle.load(open('/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'+str(self.__domainDBkey)+'_ID.pickle', 'rb'))
            # self.__XpathListNoAttrib = pickle.load(open('/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'+str(self.__domainDBkey)+'_NoAttrib.pickle', 'rb'))
            #===================================================================
        except PicklingError, e:
            print e

        self.__htmlElements = ['body', 'header', 'nav', 'footer', 'article', 'section', 'aside', 'div', 'span']
        self.__htmlAttributes = ['id', 'class']
        self.__documentIDKey = ''
        self.__utilitiesFunctions = utilities()

        # DB CONNECTIVITY AND FUNCTIONALITY
        self.__db = connectMySQL(db='xpath', port=3366)
        self.__topicModel = techDashTopicModel(destination=pwd + '/modelsLDA/',
                                               fileName='fullModel',
                                               modelName='fullModel_100P_20T')

        #=======================================================================
        # OPEN URL
        #=======================================================================
        url2Open, self.__htmlFile = self.__utilitiesFunctions.openULR(self.__fileURL)

        #=======================================================================
        # NER
        #=======================================================================
        self.__extractNerStanford = CoreNLPner
        self.__extractNerSpacy = spacyNER
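#=======================================================================
# USAGE SKETCH (hypothetical values, not part of the original class):
# ContentExtractor is built per stored HTML document; `pwd` is the project
# directory holding xpathModels/ and modelsLDA/, and the NER arguments are
# caller-provided tagger objects (a CoreNLP wrapper and a spaCy pipeline).
#=======================================================================
extractor = ContentExtractor(domain=17,                                # DB key of the source domain
                             htmlFileURL='http://example.com/news/1',  # URL of the page to extract
                             pwd='/home/user/TechDashboard',           # project root (hypothetical)
                             CoreNLPner=stanfordTagger,                # hypothetical, pre-initialised
                             spacyNER=spacyTagger)                     # hypothetical, pre-initialised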
def __init__(self, html4Elements=__html4Elements, html5Elements=__html5Elements, doctype=__doctype,
             url=__url, fileName=__fileName, domain=__domain, domainDBkey=__domainDBkey):

    # URL TO EXTRACT XPATH AND CONTENT FROM
    self.__url = url

    # BASED ON DOCTYPE, A DIFFERENT SET OF STRUCTURAL HTML ELEMENTS IS USED
    self.__doctype = doctype
    self.__html4Elements = html4Elements
    self.__html5Elements = html5Elements
    self.__structureElements = None

    # INITIAL EMPTY DICTIONARY FOR INITIAL STAGE OF PARSING/EXTRACTING
    self.__domDict = {}

    # DB CONNECTIVITY AND FUNCTIONALITY
    self.__db = connectMySQL(db='xpath', port=3366)
    # print dir(self.__db)

    # domain information
    self.__domain = domain
    self.__domainDBkey = domainDBkey
    self.__documentIDKey = None

    # CSV FILE OPERATIONS - DISCARDED FOR NOW, SWITCHING TO MYSQL OPERATION
    self.__fileName = fileName
    try:
        self.__f = open(self.__fileName, "a")
        self.__spamwriter = csv.writer(self.__f, delimiter=';', quotechar='/', quoting=csv.QUOTE_NONE)
    except IOError:
        print("There is no file named", self.__fileName)

    # GET DB KEY FOR CURRENT DOMAIN, OR INSERT INTO DB IF CONTENT FROM THAT DOMAIN HAS NOT YET BEEN ANALYZED
    self.getDomainKey()

    #=======================================================================
    # START CREATING DOM TREE
    #=======================================================================
    self.readDOMrecursive()
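#=======================================================================
# USAGE SKETCH (hypothetical class name and values): the constructor above
# drives the whole pipeline itself - it resolves the domain key via
# getDomainKey() and immediately walks the page with readDOMrecursive() -
# so creating the object is enough to parse one URL.
#=======================================================================
parser = DOMXPathExtractor(doctype='html5',                       # hypothetical class name
                           url='http://example.com/news/1',
                           fileName='xpathDump.csv',
                           domain='example.com',
                           domainDBkey=17)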
'''
Created on 27 Nov 2015

@author: jurica
'''
from mysqlUtilities import connectMySQL
from spacy.en import English, LOCAL_DATA_DIR

db = connectMySQL(db='xpath', port=3366)
sqlQuery = "SELECT xpathValuesID, xpathValuesXPathNER FROM xpath.xpathValuesXPath where xpathValuesXPathNER like '%[%]' order by xpathValuesXPathDateTime desc"
db.executeQuery(sqlQuery)

for item in db._connectMySQL__results:
    #===========================================================================
    # print item[1], type(item[1])
    #===========================================================================
    #===========================================================================
    # print len(item[1]), item[1]
    #===========================================================================
    print '========'
    if len(item[1]) > 2:
        tempList = []
        for tem in item[1].split(','):
            # strip list brackets and quote characters from each NER token
            stripped = tem.strip('"').strip("'").strip('[').strip("]")
            stripped = stripped.replace("'", "")
            stripped = stripped.replace('"', '')
            tempList.append(stripped)
        # deduplicate the cleaned entities and re-join them as a CSV string
        nerEntities = ",".join(list(set(tempList)))
        print nerEntities
    else:
        nerEntities = 'No NERs Recognized'
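#=======================================================================
# A standalone sketch of the same clean-up step, assuming the
# xpathValuesXPathNER column stores a stringified Python list such as
# "['Obama', 'Zagreb', 'Obama']". Factoring it into a function makes the
# strip/deduplicate logic reusable and testable outside the DB loop.
#=======================================================================
def clean_ner_field(raw):
    '''Split a stringified list, strip brackets/quotes, and deduplicate.'''
    tokens = []
    for part in raw.split(','):
        stripped = part.strip().strip('[]').replace("'", '').replace('"', '')
        if stripped:
            tokens.append(stripped)
    return ",".join(set(tokens)) if tokens else 'No NERs Recognized'

# e.g. clean_ner_field("['Obama', 'Zagreb', 'Obama']") -> "Obama,Zagreb" (set order not guaranteed)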
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
from mysqlUtilities import connectMySQL
from pprint import pprint
import time
import traceback

# tags = ['izboriRH','izboriHR','izbori2015','izbori']
tags = ["izboriRH"]

db = connectMySQL(db="crolections", port=3366)

# consumer key, consumer secret, access token, access secret.
ckey = "YaV9e065RrS7GDG7ZPOeCHl3c"
csecret = "ttfJ3oaGuViY2QnrB4AjtY259ow5uDef5CiK0GMOTVQW1kNDyF"
atoken = "2926971581-qhxdUJThotm8Jpmy9Ks5P2XWivDcYSeCtvaDCpj"
asecret = "eLd0UWqoFt5riom4hzKukk1jVrpK6zFdm5dPLVSJnfnz9"


class listener(StreamListener):

    def on_data(self, data):
        try:
            all_data = json.loads(data)
            #===================================================================
            # pprint(all_data)
#===============================================================================
# http://stackoverflow.com/questions/28267640/tweepy-get-old-tweets-now-possible-with-twitter-search-api
# how to get old tweets
#===============================================================================
from tweepy.models import Status
import tweepy
from pprint import pprint
import json
import time

from mysqlUtilities import connectMySQL

db = connectMySQL(db='crolections', port=3366)

ckey = "YaV9e065RrS7GDG7ZPOeCHl3c"
csecret = "ttfJ3oaGuViY2QnrB4AjtY259ow5uDef5CiK0GMOTVQW1kNDyF"
atoken = "2926971581-qhxdUJThotm8Jpmy9Ks5P2XWivDcYSeCtvaDCpj"
asecret = "eLd0UWqoFt5riom4hzKukk1jVrpK6zFdm5dPLVSJnfnz9"

auth = tweepy.OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
api = tweepy.API(auth)

query = '#izboriRH OR #izborirh OR #izbori2015 OR #izbori15 OR #politikahr'

for all_data in tweepy.Cursor(api.search, q=query).items():
    sqlExists = 'select count(*) from izbori2015 where izbori2015_tweet_id = %d' % (all_data.id)
    db.executeQuery(sqlExists)
    #===========================================================================
    # print sqlExists