def __init__(self, destination, fileName, modelName='', ldaPasses='', topicNum=''):
        '''
        Constructor
        '''
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        
        self.__destination = destination
        self.__fileName = fileName
        self.__modelName = modelName
        self.__ldaPasses = ldaPasses
        self.__topicNum = topicNum
                
        #=======================================================================
        # STOP WORDS AND CHARACTERS
        #=======================================================================
        self.__stopwords = stopwords.words('english')  # + string.punctuation
        self.__chars_to_remove = [
            u'[', u']', u'(', u')', u'*', u'%', u'{', u'}', u'\n', u'\n\n',
            u'\t', u';', u'/', u'^', u'--', u'\\', u'+', u'-', u'.', u'?',
            u'&', u'#',
        ]
        self.__stopwords.extend(self.__chars_to_remove)
        self.__stopwords.extend(list(string.punctuation))

        #=======================================================================
        # DATABASE
        #=======================================================================
        self.__db = connectMySQL(db='xpath', port=3366)
        self.__queryResults = None
        self.__cleanedCorpus = []
        

        if modelName != '' and os.path.exists(self.__destination + modelName + '.lda'):
            self.__ldaModel = LdaModel.load(self.__destination + modelName + '.lda', mmap='r')

        if fileName != '' and os.path.exists(self.__destination + fileName + '.dict'):
            self.__modelDict = corpora.Dictionary.load(self.__destination + fileName + '.dict')
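
A minimal usage sketch for this constructor, assuming the class is named techDashTopicModel (as Example #2 below suggests) and that a trained fullModel_100P_20T.lda and fullModel.dict already sit under the destination directory; the module name topicModel is hypothetical:

from topicModel import techDashTopicModel  # hypothetical module name

# Loads fullModel_100P_20T.lda and fullModel.dict if they exist under the
# destination; otherwise only the DB connection and stopword list are set up.
model = techDashTopicModel(destination='/path/to/modelsLDA/',
                           fileName='fullModel',
                           modelName='fullModel_100P_20T')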
Example #2
class ContentExtractor(object):
    '''
    Extracts article content from HTML pages, using pre-learned XPath models,
    NER taggers, and an LDA topic model.

    steps:
        get serialized json file
    '''
    @profile
    def __init__(self,
                 domain,
                 htmlFileURL,
                 pwd,
                 CoreNLPner='',
                 spacyNER='',
                 dbConnection=''):
        '''
        Constructor
        '''
        self.__fileURL = htmlFileURL
        self.__domainDBkey = domain

        try:
            self.__XpathList = pickle.load(
                open(
                    pwd + '/xpathModels/' + str(self.__domainDBkey) +
                    '.pickle', 'rb'))
            #===================================================================
            # self.__XpathListID = pickle.load(open('/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'+str(self.__domainDBkey)+'_ID.pickle', 'rb'))
            # self.__XpathListNoAttrib = pickle.load(open('/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'+str(self.__domainDBkey)+'_NoAttrib.pickle', 'rb'))
            #===================================================================
        except (pickle.UnpicklingError, IOError) as e:
            # pickle.load raises UnpicklingError on bad data (PicklingError
            # only applies to dump); IOError covers a missing model file.
            print e

        self.__htmlElements = [
            'body', 'header', 'nav', 'footer', 'article', 'section', 'aside',
            'div', 'span'
        ]
        self.__htmlAttributes = ['id', 'class']
        self.__documentIDKey = ''
        self.__utilitiesFunctions = utilities()

        #DB CONNECTIVITY AND FUNCTIONALITY
        self.__db = connectMySQL(db='xpath', port=3366)
        self.__topicModel = techDashTopicModel(destination=pwd + '/modelsLDA/',
                                               fileName='fullModel',
                                               modelName='fullModel_100P_20T')

        #=======================================================================
        # OPEN URL
        #=======================================================================
        url2Open, self.__htmlFile = self.__utilitiesFunctions.openULR(
            self.__fileURL)

        #=======================================================================
        # NER
        #=======================================================================
        self.__extractNerStanford = CoreNLPner
        self.__extractNerSpacy = spacyNER
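
A usage sketch under stated assumptions: the module name contentExtractor is hypothetical, the NER handles are left at their defaults, and domain must be a key with a pickled XPath model under pwd + '/xpathModels/':

from contentExtractor import ContentExtractor  # hypothetical module name

# domain selects the pickled XPath model; pwd is the project root that holds
# the xpathModels/ and modelsLDA/ directories.
extractor = ContentExtractor(domain=42,
                             htmlFileURL='http://example.com/article.html',
                             pwd='/path/to/TechDashboard')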
Example #3
    def __init__(self,
                 html4Elements=__html4Elements,
                 html5Elements=__html5Elements,
                 doctype=__doctype,
                 url=__url,
                 fileName=__fileName,
                 domain=__domain,
                 domainDBkey=__domainDBkey):
        #URL TO EXTRACT XPATH AND CONTENT FROM
        self.__url = url

        #BASED ON DOCTYPE, DIFFERENT SET OF STRUCTURE HTML ELEMENTS ARE USED
        self.__doctype = doctype
        self.__html4Elements = html4Elements
        self.__html5Elements = html5Elements
        self.__structureElements = None

        #INITIAL EMPTY DICTIONARY FOR INITIAL STAGE OF PARSING/EXTRACTING
        self.__domDict = {}

        #DB CONNECTIVITY AND FUNCTIONALITY
        self.__db = connectMySQL(db='xpath', port=3366)
        #print dir(self.__db)

        #domain information
        self.__domain = domain
        self.__domainDBkey = domainDBkey
        self.__documentIDKey = None

        #CSV file operations - DISCARDED FOR NOW, SWITCHING TO MYSQL OPERATION
        self.__fileName = fileName
        try:
            self.__f = open(self.__fileName, "a")
            self.__spamwriter = csv.writer(self.__f,
                                           delimiter=';',
                                           quotechar='/',
                                           quoting=csv.QUOTE_NONE)
        except IOError:
            print("There is no file named", self.__fileName)

        #GET DB KEY FOR CURRENT DOMAIN OR INSERT INTO DB IF CONTENT FROM THAT DOMAIN NOT YET ANALYZED
        self.getDomainKey()

        #=======================================================================
        # START CREATING DOM TREE
        #=======================================================================
        self.readDOMrecursive()
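
readDOMrecursive itself is not part of this excerpt; the sketch below only illustrates the general shape such a method might take: a depth-first walk that records structural elements under an XPath-like key. The use of lxml, the function name, and the sample file are all assumptions:

from lxml import html

STRUCTURE_ELEMENTS = ['body', 'header', 'nav', 'footer', 'article',
                      'section', 'aside', 'div', 'span']

def read_dom_recursive(element, path, dom_dict):
    # Depth-first walk: record the id/class of structural elements under an
    # XPath-like key, then recurse into each child.
    for position, child in enumerate(element, start=1):
        if not isinstance(child.tag, str):
            continue  # skip comments and processing instructions
        child_path = '%s/%s[%d]' % (path, child.tag, position)
        if child.tag in STRUCTURE_ELEMENTS:
            dom_dict[child_path] = (child.get('id'), child.get('class'))
        read_dom_recursive(child, child_path, dom_dict)

dom_dict = {}
tree = html.parse('page.html')  # hypothetical local file
read_dom_recursive(tree.getroot(), '', dom_dict)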
Example #6
'''
Created on 27 Nov 2015

@author: jurica
'''
from mysqlUtilities import connectMySQL
from spacy.en import English, LOCAL_DATA_DIR

db = connectMySQL(db='xpath', port=3366)
sqlQuery = "SELECT xpathValuesID, xpathValuesXPathNER FROM xpath.xpathValuesXPath where xpathValuesXPathNER like '%[%]' order by xpathValuesXPathDateTime desc"

db.executeQuery(sqlQuery)
for item in db._connectMySQL__results:
    #===========================================================================
    # print item[1], type(item[1])
    #===========================================================================
    #===========================================================================
    # print len(item[1]), item[1]
    #===========================================================================
    print '========'
    if len(item[1]) > 2:
        tempList = []
        for tem in item[1].split(','):
            stripped = tem.strip('"').strip("'").strip('[').strip("]")
            stripped = stripped.replace("'", "")
            stripped = stripped.replace('"', '')
            tempList.append(stripped)
        # join and print once per row, after all fragments are collected
        nerEntities = ",".join(list(set(tempList)))
        print nerEntities
    else:
        nerEntities = 'No NERs Recognized'
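    #===========================================================================
    # Sketch of a possible next step (not in the original): persist the cleaned
    # entity list back to the row it came from. Only executeQuery(sql) is
    # visible from the connectMySQL wrapper in this excerpt, so the UPDATE is
    # built as a plain string with minimal quote escaping.
    #===========================================================================
    sqlUpdate = "UPDATE xpath.xpathValuesXPath SET xpathValuesXPathNER = '%s' WHERE xpathValuesID = %d" \
                % (nerEntities.replace("'", "\\'"), item[0])
    db.executeQuery(sqlUpdate)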
Example #7
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
from mysqlUtilities import connectMySQL
from pprint import pprint
import time
import traceback

# tags = ['izboriRH','izboriHR','izbori2015','izbori']
tags = ["izboriRH"]

# connectMySQL is already imported above; no need to import it twice
db = connectMySQL(db="crolections", port=3366)

# consumer key, consumer secret, access token, access secret.
ckey = "YaV9e065RrS7GDG7ZPOeCHl3c"
csecret = "ttfJ3oaGuViY2QnrB4AjtY259ow5uDef5CiK0GMOTVQW1kNDyF"
atoken = "2926971581-qhxdUJThotm8Jpmy9Ks5P2XWivDcYSeCtvaDCpj"
asecret = "eLd0UWqoFt5riom4hzKukk1jVrpK6zFdm5dPLVSJnfnz9"


class listener(StreamListener):
    def on_data(self, data):

        try:

            all_data = json.loads(data)
            # ===================================================================
            # pprint(all_data)
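            #===================================================================
            # The excerpt is cut off here; the lines below are a sketch of a
            # plausible continuation, not the original code. They assume the
            # incoming JSON carries the standard 'id' and 'text' tweet fields.
            #===================================================================
            print all_data['id'], all_data['text']
            return True
        except Exception:
            traceback.print_exc()
            return True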

Example #8

#===============================================================================
# http://stackoverflow.com/questions/28267640/tweepy-get-old-tweets-now-possible-with-twitter-search-api
# how to get old tweets 
#===============================================================================

from tweepy.models import Status
import tweepy
from pprint import pprint
import json
import time
from mysqlUtilities import connectMySQL

db = connectMySQL(db='crolections', port=3366)

ckey = "YaV9e065RrS7GDG7ZPOeCHl3c"
csecret = "ttfJ3oaGuViY2QnrB4AjtY259ow5uDef5CiK0GMOTVQW1kNDyF"
atoken = "2926971581-qhxdUJThotm8Jpmy9Ks5P2XWivDcYSeCtvaDCpj"
asecret = "eLd0UWqoFt5riom4hzKukk1jVrpK6zFdm5dPLVSJnfnz9"

auth = tweepy.OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
api = tweepy.API(auth)
query = '#izboriRH OR #izborirh OR #izbori2015 OR #izbori15 OR #politikahr'


for all_data in tweepy.Cursor(api.search, q=query).items():
     
    sqlExists = 'select count(*) from izbori2015 where izbori2015_tweet_id = %d'%(all_data.id)
    db.executeQuery(sqlExists)
    #===========================================================================
    # print sqlExists
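    #===========================================================================
    # Sketch of a plausible continuation (not in the original excerpt): store
    # only tweets that are not yet in the table. Reading the count through
    # db._connectMySQL__results mirrors Example #6; the izbori2015_text column
    # name is an assumption, like the izbori2015_tweet_id used in the check.
    #===========================================================================
    if db._connectMySQL__results[0][0] == 0:
        sqlInsert = "INSERT INTO izbori2015 (izbori2015_tweet_id, izbori2015_text) VALUES (%d, '%s')" \
                    % (all_data.id, all_data.text.encode('utf-8').replace("'", "\\'"))
        db.executeQuery(sqlInsert)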