Example #1
def insertNewContentToDict(localFilename, page_redirected, page_info,
                           page_sha224, page_status, commTimeDir):
    #wrap the page metadata in a dict keyed by the sha224 of the page content;
    #'comm' is a project-local helper module (defined elsewhere)
    try:
        newDict = dict()
        #note: page_info itself is extended in place (no copy is made)
        newDict[page_sha224] = page_info
        newDict[page_sha224]["localFilename"] = localFilename
        newDict[page_sha224]["file_url"] = page_redirected
        newDict[page_sha224]["sha224"] = page_sha224
        newDict[page_sha224]["status"] = page_status
        newDict[page_sha224]["timeDir"] = commTimeDir
        return newDict
    except Exception:
        #log the error; callers check for a falsy return value
        comm.printException(comm.pathToSaveJsonErrors)
        return None
Example #2
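The snippet below also assumes these module-level imports; METADATA_SERVER, SERVICE_ACCOUNT, _API_VERSION and the project-local helpers comm, insertObj, updateExistingObj and insertNewSubFileToDict are defined elsewhere in the original module and are not reproduced here.

import io
import json
import httplib2
from googleapiclient import discovery as api_discovery
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseDownload
from oauth2client import client as oauth2_client
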
def main(argv):
    #load json-formatted metadata into a dictionary
    aadress = json.loads(argv["address"])
    _BUCKET_NAME = aadress["bucket"].encode()
    _FILE1_NAME = aadress["object"].encode()
    jsondata = argv["jsondata"].encode()
    localFilename = argv["localFilename"].encode() #SHA of url for local storage
    commTimeDir = argv["timeDir"].encode() #current time when the file was accessed
    redirectedTo = argv["redirectedTo"].encode()
    url = redirectedTo
    pageInfo = argv["pageInfo"] #dict object
    sha224_ = argv["sha224_"].encode()
    statusCode = argv["statusCode"] #int object
    
    #google-python-api code, get access to storage bucket
    http = httplib2.Http()
    token_uri = '%s/%s/token' % (METADATA_SERVER, SERVICE_ACCOUNT)
    resp, content = http.request(token_uri, method='GET',
                                 body=None,
                                 headers={'Metadata-Flavor': 'Google'})
    #google-python-api code, authenticate credentials
    if resp.status == 200:
        d = json.loads(content)
        access_token = d['access_token']  # Save the access token
        credentials = oauth2_client.AccessTokenCredentials(access_token, 'my-user-agent/1.0')
        client = api_discovery.build('storage', _API_VERSION, http=credentials.authorize(http))

        #tracks whether new entities should be extracted from this document
        someNewData = False
        #try to open the existing object
        try:
            #google-python-api code, get the metadata of the bucket object
            req = client.objects().get(
                    bucket=_BUCKET_NAME,
                    object=_FILE1_NAME)
            try:
                resp = req.execute()
            #if the object could not be opened, it does not exist yet,
            #therefore this doc is 'new data'
            except HttpError:
                someNewData = True
            except Exception:
                #treat any other failure the same way
                someNewData = True
            
            #when the object exists,
            #check whether its content has changed
            if someNewData is False: #a file with this name already exists in this bucket
                #google-python-api code, Get Payload Data: get object in bucket
                req = client.objects().get_media(
                    bucket=_BUCKET_NAME,
                    object=_FILE1_NAME)
                #the BytesIO object may be replaced with any io.IOBase instance
                fh = io.BytesIO()
                #google-python-api code
                downloader = MediaIoBaseDownload(fh, req, chunksize=1024*1024)
                done = False
                #download object
                while not done:
                    status, done = downloader.next_chunk()
                #'status' reports progress; the object is fully downloaded here
                
                #load json-object into dict
                existingFileDict = json.loads(fh.getvalue())  #yields a dict
                #'existingFileDict' is the downloaded object as a dict
                #'base_url' is the host name of the current web document
                #if url (the file name of the current web document) contains 'base_url',
                #this doc may already have been processed
                if existingFileDict['base_url'] in url: #same host name was requested
                    #the dict has two kinds of top-level keys: 'base_url' and the sha224 of each file name
                    #check whether this doc was already processed earlier:
                    #get the list of file-name hashes in the downloaded (already existing) json object
                    fNameKey = [k for k in existingFileDict.keys() if k != 'base_url']
                    #this list may contain 'localFilename',
                    #the hash of the file url of the current document
                    #true if the current file name is among the existing file names of this host
                    if localFilename in fNameKey:
                        #if the saved content sha differs from the current doc's content sha,
                        #the contents of the file have changed.
                        #the saved content shas are stored as keys of the file entry:
                        shaKeys = existingFileDict[localFilename].keys()
                        #true if the current doc's content hash is not found
                        if sha224_ not in shaKeys: #the file has changed
                            #search for the same date; if found, update the existing sha key
                            #to avoid redundant info
                            replaceSha = ""
                            #if some sha key holds the same timeDir, take that sha and replace it
                            #loop over all content hashes (these are keys in the json file)
                            for sk in shaKeys:
                                savedDate = existingFileDict[localFilename][sk]["timeDir"]
                                #compare the saved date to the current date
                                if savedDate == commTimeDir: #the same date was found
                                    replaceSha = sk
                                    break #no need to search further
                            #true if the same date was found
                            if replaceSha != "": #delete that sha, a same-day duplicate
                                del existingFileDict[localFilename][replaceSha]
                            #add a new value under a new content_sha key below the filename_sha key:
                            #the file url is the same, but the content has changed,
                            #so add/update the new content sha
                            newDataDict = insertNewContentToDict(localFilename, redirectedTo, pageInfo, sha224_, statusCode, commTimeDir)
                            if newDataDict:
                                existingFileDict[localFilename].update(newDataDict)
                                updateExistingObj(existingFileDict, client, _BUCKET_NAME, _FILE1_NAME)
                                someNewData = True
                            
                    #the current file name was NOT among the existing file names of this host
                    else: #a new file (resource) from the same domain (or 'base_url') was requested
                        #add a new value under a new filename_sha key for that base resource
                        newDataDict = insertNewSubFileToDict(localFilename, redirectedTo, pageInfo, sha224_, statusCode, commTimeDir)
                        if newDataDict:
                            existingFileDict.update(newDataDict)
                            updateExistingObj(existingFileDict, client, _BUCKET_NAME, _FILE1_NAME)
                            someNewData = True
            #the current host has NOT been accessed/processed so far
            else: #insert the new file into the bucket
                insertObj.insertNewObject(client, _BUCKET_NAME, _FILE1_NAME, jsondata)

            #report whether to send the doc to the parser and extract new entities
            print(someNewData)
        except oauth2_client.AccessTokenRefreshError:
            errstr = ("oauth2_client.AccessTokenRefreshError_False_credentials")
            comm.printException(comm.pathToSaveJsonErrors, errstr)
            pass
    else:
        errstr = "Cannot_access_google_storage_" + (str(False) + str(resp.status))
        comm.printException(comm.pathToSaveJsonErrors, errstr)
        pass