# --- Example 1 ---
def fill_rowObj_fromFile(fullpath, rowObj, args):
    '''
    Fill rowObj with data derived from the file itself.

    Fills duration, the location-appropriate SHA1 hash, and the identifier,
    but only for fields that are currently empty in rowObj.

    fullpath -- full path to the file on disk
    rowObj   -- dotdict row object, mutated in place
    args     -- parsed CLI args (unused here; kept for call-site symmetry)
    Returns the (mutated) rowObj.
    '''
    file = os.path.basename(fullpath)
    # dotdict supports attribute access; use it consistently
    # (original mixed rowObj['data']['duration'] and rowObj.data.duration)
    duration = rowObj.data.duration
    loggr(duration)
    print(duration)
    if not duration:
        loggr("no duration in catalog, getting duration of " + file)
        print("no duration in catalog, getting duration of " + file)
        rowObj.data.duration = get_file_duration(fullpath)
    # the catalog tracks separate hash columns depending on where the file lives
    if "nas" in fullpath or "NAS" in fullpath:
        whichHash = 'SHA1hash-onRAID'
    else:
        whichHash = 'SHA1hash-ondrive'
    file_hash = rowObj.data[whichHash]  # renamed: don't shadow builtin hash()
    if not file_hash:
        loggr("no hash in catalog, hashing " + file)
        print("no hash in catalog, hashing " + file)
        rowObj.data[whichHash] = hash_file(fullpath)
        if not rowObj.data[whichHash]:
            # hash_file returns False on failure
            loggr("hash error")
            print("hash error")
    uid = rowObj.identifier
    if not uid:
        loggr(
            "identifier not in rowObj, attempting to locate uid in filename in mfd.fill_rowObj_fromFile()"
        )
        print(
            "identifier not in rowObj, attempting to locate uid in filename in mfd.fill_rowObj_fromFile()"
        )
        rowObj.identifier = get_uid_from_file(fullpath)
    return rowObj
# --- Example 2 ---
def make_rowObj(args):
    '''
    Build a rowObj via make_rowObject: initialize a blank row object,
    then populate it from the catalog.
    '''
    blank_obj, header_map = make_rowObject.init_rowObject(args)
    filled_obj = make_rowObject.fill_rowObject_fromCatalog(
        blank_obj, header_map, args)
    loggr(filled_obj)
    return filled_obj
# --- Example 3 ---
def clean_header_row(header_row):
    '''
    Remove header entries that are not tracked per-row
    (name, role, hashes match?, showHide, identifier).

    header_row -- list of header strings from the sheet
    Returns a new filtered list; header_row is not modified.
    '''
    loggr("cleaning header_row in mro.clean_header_row()")
    print("cleaning header_row in mro.clean_header_row()")
    # set membership replaces the original chain of five `not header == ...`
    # comparisons: clearer and O(1) per header
    skip = {"name", "role", "hashes match?", "showHide", "identifier"}
    _header_row = [header for header in header_row if header not in skip]
    loggr("header_row_cleaning complete in mro.clean_header_row()")
    # bug fix: this was pprint(), a typo -- every other loggr() in the file is
    # paired with print(), and pprint() would add quotes around the string
    print("header_row_cleaning complete in mro.clean_header_row()")
    return _header_row
# --- Example 4 ---
def get_uid_from_file(filepath):
    '''
    Extract the 14-digit uid from a full path.

    filepath -- path (or filename) possibly containing the uid
    Returns the uid as a string, or False if no 14-digit run is found.
    '''
    loggr("attempting to locate uid in file name in mfd.get_uid_from_file")
    print("attempting to locate uid in file name in mfd.get_uid_from_file")
    # uid = first run of exactly 14 consecutive digits anywhere in the path
    # (dropped the dead `match = ''` initializer; re.search overwrote it)
    match = re.search(r'\d{14}', filepath)
    if match:
        uid = match.group()
        print("uid is " + uid)
        return uid
    return False
# --- Example 5 ---
def fill_rowObj_fromRow(rowObj, header_map, args):
    '''
    Fill rowObj.data from the spreadsheet row rowObj.row.

    rowObj     -- dotdict with .row already set; mutated in place
    header_map -- dict of header name -> column letter (e.g. 'filename': 'B')
    args       -- carries the gspread worksheet in args.worksheet
    Returns the (mutated) rowObj.
    '''
    loggr("filling rowObj with row" + str(rowObj.row) + " data from " +
          args.sheet + " in mro.fill_rowObj_fromRow()")
    print("filling rowObj with row" + str(rowObj.row) + " data from " +
          args.sheet + " in mro.fill_rowObj_fromRow()")
    rowData = args.worksheet.row_values(rowObj.row)
    for name, letter in header_map.items():
        # column letter -> 0-based list index (ord('A') == 65)
        indx = ord(letter)
        rowObj.data[name] = rowData[indx - 65]  #assigns every key in data:{}
    # bug fix: this assignment was inside the loop above, issuing one remote
    # acell() API call per header even though the value never changes
    rowObj.identifier = args.worksheet.acell("A" + str(rowObj.row)).value
    loggr("rowObj fill from row complete")
    print("rowObj fill from row complete")
    return rowObj
# --- Example 6 ---
def clean_header_column_map(header_column_map, header_row):
    '''
    Build a normalized header -> column-letter map restricted to the
    (already cleaned) headers in header_row.
    '''
    loggr("cleaning header_column_map in mro.clean_header_column_map()")
    print("cleaning header_column_map in mro.clean_header_column_map()")
    # keep only the headers that survived clean_header_row()
    header_map = dotdict(
        {header: header_column_map[header] for header in header_row})
    loggr(
        "header_column_map cleaning completed in mro.clean_header_column_map()"
    )
    print(
        "header_column_map cleaning completed in mro.clean_header_column_map()"
    )
    return header_map
# --- Example 7 ---
def main():
    '''
    Entry point: parse args, set per-platform paths, then dispatch to the
    hasher, directory inventory, and/or Dropbox-to-traffic move.
    '''
    loggr("move_data.py started at " + str(datetime.now()))
    print("move_data.py started at " + str(datetime.now()))
    args = init()
    args.sheet = 'catalog'
    # per-platform mount points for Dropbox and the NAS
    if platform == "linux" or platform == "linux2":
        args.Dropbox = "/root/Dropbox/MF archival audio"
        args.traffic = "/mnt/nas/traffic"
        args.nas = "/mnt/nas"
    elif platform == "darwin":
        args.Dropbox = "/root/Dropbox/MF archival audio"
        args.traffic = "/Volumes/NAS_Public/traffic"
        args.nas = "/Volumes/NAS_Public"
    loggr(args)
    if args.hasher:
        args = gh.get_worksheet(args)
        hasher(args.hasher, args)
    if args.it or args.io:
        inventory_directory(args)
    if args.mdtt:
        try:
            moveDropboxToTraffic(args)
        except Exception as e:
            # bug fix: was a bare `except:` which also swallowed
            # SystemExit/KeyboardInterrupt and discarded the error detail
            loggr("moveDropboxToTraffic didn't work :( " + str(e))
# --- Example 8 ---
def inventory_directory(args):
    '''
    Send file data to the catalog for every eligible file under args.path.

    Walks args.traffic (--inventoryTraffic) or args.io (--inventoryOther),
    skipping .zip/.xml files and dotfiles. With --start, only files whose
    14-digit uid prefix is >= args.start are processed. Sleeps 60s between
    files to stay under the Sheets API rate limit.
    '''
    if args.it:
        args.path = args.traffic
    elif args.io:
        args.path = args.io
    loggr("getting list of files from " + args.path +
          " in md.inventory_directory()")
    print("getting list of files from " + args.path +
          " in md.inventory_directory()")
    for dirs, subdirs, files in os.walk(args.path):
        for file in files:
            # guard clauses replace the original nested if/else, which
            # duplicated the process/sleep sequence in both branches
            if file.endswith((".zip", ".xml")) or file.startswith("."):
                continue
            if args.start and int(args.start) > int(file[:14]):
                continue
            process_single_file(os.path.join(dirs, file), args)
            loggr("sleeping for 60s for API reset")
            print("sleeping for 60s for API reset")
            time.sleep(60)
# --- Example 9 ---
def hash_file(filepath):
    '''
    Create a SHA1 hash of a file using the external `shasum` tool.

    filepath -- path of the file to hash
    Returns the 40-character hex digest as a string, or False on error.
    '''
    loggr('attempting to hash file ' + filepath)
    print('attempting to hash file ' + filepath)
    try:
        # security/correctness fix: pass argv as a list instead of a
        # shell=True string -- the old "shasum '" + filepath + "'" broke on
        # paths containing an apostrophe and was shell-injectable
        output = subprocess.check_output(["shasum", filepath])
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: shasum failed (e.g. file missing);
        # OSError: shasum binary itself not found
        return False
    # SHA1 digest = 40 consecutive word characters in the decoded output
    match = re.search(r'\w{40}', output.decode("utf-8"))
    if match:
        thehash = match.group()
        loggr("file " + os.path.basename(filepath) + " hash is " + thehash)
        print("file " + os.path.basename(filepath) + " hash is " + thehash)
        return thehash
    return False
# --- Example 10 ---
def make_single_file_inventory(file, row, rowObj, uids, header_map, args):
    '''
    Generate and upload catalog data for a single file.

    file       -- bare filename (joined with args.path)
    row        -- 0-based list index of the target row; sheet row is row + 1
    rowObj     -- dotdict row object, mutated in place
    uids       -- list of uids already present in the catalog
    header_map -- header name -> column letter map
    args       -- carries path, worksheet; args.uid is set as a side effect
    Returns the (mutated) rowObj.
    '''
    fullpath = os.path.join(args.path, file)
    rowObj.row = row + 1  # sheet rows are 1-based
    rowObj.data.filename = file
    uidInFile = mfd.get_uid_from_file(fullpath)
    if uidInFile:
        rowObj.identifier = uidInFile
    else:
        # bug fix: get_last_uid requires args (it reads args.spreadsheet /
        # args.sheet) but was called with no arguments -> TypeError
        rowObj.identifier = int(mtd.get_last_uid(args)) + 1
    args.uid = rowObj.identifier
    rowObj.data['SHA1 hash - on RAID'] = mfd.hash_file(fullpath)
    loggr(rowObj)
    # only upload when this uid is not already cataloged
    if rowObj.identifier not in uids:
        rowObj.data.duration = mfd.get_file_duration(fullpath)
        gh.update_cell_value("A" + str(rowObj.row), rowObj.identifier,
                             args.worksheet)
        for key, value in rowObj.data.items():
            gh.update_cell_value(header_map[key] + str(rowObj.row), value,
                                 args.worksheet)
    return rowObj
# --- Example 11 ---
def init(argv=None):
    '''
    Parse command-line options and return the args namespace.

    argv -- optional list of argument strings; defaults to sys.argv[1:]
            (backward-compatible addition, mainly for testing)
    '''
    loggr("initializing variables")
    parser = argparse.ArgumentParser(description="makes the metadata ~flow~")
    parser.add_argument(
        '--moveDropboxToTraffic',
        dest='mdtt',
        action='store_true',
        help="moves file from Dropbox folder to traffic on NAS")
    parser.add_argument('--inventoryTraffic',
                        dest="it",
                        default=False,
                        action='store_true',
                        help="send file data from traffic to catalog")
    parser.add_argument(
        '--inventoryOther',
        dest='io',
        default=False,
        help="the top-level path that you would like to inventory")
    parser.add_argument(
        '--overwriteOK',
        dest='ook',
        action='store_true',
        default=False,
        help='allow re-upload of catalog data for existing entries')
    parser.add_argument('--start',
                        dest='start',
                        type=int,
                        default=0,
                        # typo fix: was "the starting row number number"
                        help="the starting row number")
    parser.add_argument('--hasher',
                        dest="hasher",
                        help="hash all the files in a directory")
    args = parser.parse_args(argv)
    return args
# --- Example 12 ---
def get_header(args):
    '''
    Return (header_row, header_column_map) for the sheet named by args.sheet.

    Fetches the worksheet first if args.worksheet is not yet set.
    '''
    if not args.worksheet:
        loggr("getting ws data in mro.get_header()")
        print("getting ws data in mro.get_header()")
        # bug fix: the result was assigned to an unused local `worksheet`,
        # leaving args.worksheet unset; every other call site does
        # `args = gh.get_worksheet(args)`
        args = gh.get_worksheet(args)
    loggr("retrieving header row from " + args.sheet + " in mro.get_header()")
    print("retrieving header row from " + args.sheet + " in mro.get_header()")
    header_row = args.worksheet.row_values(1)
    loggr("creating header_column_map from " + args.sheet +
          " in mro.get_header()")
    print("creating header_column_map from " + args.sheet +
          " in mro.get_header()")
    header_column_map = make_header_column_map(header_row)
    loggr("header_row and header_column_map created in mro.get_header")
    print("header_row and header_column_map created in mro.get_header")
    return header_row, header_column_map
# --- Example 13 ---
def get_last_uid(args):
    '''
    Return the highest uid found in column A of args.sheet.

    NOTE: max() compares strings lexicographically; this is equivalent to
    numeric order only because uids are fixed-width 14-digit strings.
    '''
    loggr("getting last/ highest uid from catalog in mtd.get_last_uid()")
    print("getting last/ highest uid from catalog in mtd.get_last_uid()")
    uids = []
    # bug fix: `sheet` was an undefined name (NameError); use args.sheet
    loggr("getting uids from " + args.sheet)
    print("getting uids from " + args.sheet)
    worksheet = args.spreadsheet.worksheet(args.sheet)
    _uids = worksheet.col_values(1)
    uids = uids + _uids[1:]  # drop the header row
    last_uid = max(uids)
    loggr("last uid is " + str(last_uid))
    print("last uid is " + str(last_uid))
    return last_uid
# --- Example 14 ---
def make_header_column_map(header_row):
    '''
    Map each header to its spreadsheet column letter (A, B, C, ...).

    header_row -- list of header strings in column order
    Returns a dotdict of header -> single column letter.
    '''
    loggr("initializing header_column_map in mro.make_header_column_map()")
    print("initializing header_column_map in mro.make_header_column_map()")
    header_column_map = dotdict({})
    loggr("filling header_column_map in mro.make_header_column_map()")
    print("filling header_column_map in mro.make_header_column_map()")
    # enumerate fixes header_row.index(header): that was O(n^2) and mapped
    # any duplicate header to the first occurrence's column
    for indx, header in enumerate(header_row):
        char = string.ascii_uppercase[indx]
        header_column_map[header] = char
        print(header + ":" + char)
    loggr("header_column_map created in mro.make_header_column_map()")
    print("header_column_map created in mro.make_header_column_map()")
    return header_column_map
# --- Example 15 ---
def update_catalog(rowObj, catalog_rowObj, header_map, args):
    '''
    Update catalog cells that are empty in the catalog but filled in the
    file-derived rowObj.

    rowObj         -- row object holding values derived from the file
    catalog_rowObj -- row object holding the catalog's current values
    header_map     -- header name -> column letter map
    args           -- carries the gspread worksheet in args.worksheet
    '''
    loggr("updating catalog with file info in md.update_catalog()")
    print("updating catalog with file info in md.update_catalog()")
    for key, value in rowObj.data.items():
        loggr(key)
        loggr("rowObj value: " + str(value))
        if value:
            catalog_value = catalog_rowObj.data[key]
            # bug fix: str() guards -- catalog/file values may be non-string
            # (ints, False from hash_file), and bare `+` raised TypeError
            loggr("catalog_value: " + str(catalog_value))
            if not catalog_value:
                loggr("no catalog value found for key " + key +
                      ", updating catalog")
                print("no catalog value found for key " + key +
                      ", updating catalog")
                cell = header_map[key] + str(rowObj.row)
                # dropped the redundant `value = rowObj.data[key]` -- that is
                # already what `value` holds from the items() iteration
                loggr("updating cell " + cell + " with value " + str(value))
                print("updating cell " + cell + " with value " + str(value))
                gh.update_cell_value(cell, value, args.worksheet)
# --- Example 16 ---
def is_file_cataloged(fullpath, args):
    '''
    Look up a file (by basename) in the catalog worksheet.

    fullpath -- full path of the file; only its basename is matched
    args     -- carries args.sheet and args.worksheet

    Returns a (result, header_map) tuple:
      - (rowObj, header_map) with rowObj filled from the row, if found
      - (False, header_map) if the file is not cataloged
    '''
    loggr("initializing rowObj and header map in mtd.is_file_cataloged()")
    print("initializing rowObj and header map in mtd.is_file_cataloged()")
    rowObj, header_map = make_rowObject.init_rowObject(args)  #get blank rowObj
    pprint(rowObj)
    pprint(header_map)
    loggr("initializing lists of filenames, locations, rows from " +
          args.sheet + " in mtd.is_file_cataloged()")
    print("initializing lists of filenames, locations, rows from " +
          args.sheet + " in mtd.is_file_cataloged()")
    # ord('A') == 65, so ord(letter) - 64 gives the 1-based column number
    # that gspread's col_values() expects
    indx = ord(header_map['filename']) - 64  #get column number of filename
    filenames = args.worksheet.col_values(
        indx)  #get list of filenames from spreadsheet column
    #filenames = filenames[1:] #remove header row from list of filename
    # NOTE(review): the header row is deliberately NOT stripped from
    # `filenames` (the strip above is commented out) -- that offset makes
    # list index == sheet row - 1, which the `+ 1` below relies on
    indx = ord(header_map['RAID-dir']) - 64  #get column number of dir
    dirs = args.worksheet.col_values(
        indx)  #get list of directories from spreadsheet column
    dirs = dirs[1:]  #remove header row from list of dirs
    uids = args.worksheet.col_values(
        1)  #get list of uids - which are always in column 1/A
    #uids = uids[1:]
    fname = os.path.basename(fullpath)
    #fpath = fullpath.replace(fname, "")
    loggr("initialization complete in mtd.is_file_cataloged()")
    print("initialization complete in mtd.is_file_cataloged()")
    if fname in filenames:
        loggr("file is cataloged")
        print("file is cataloged")
        rowObj.row = filenames.index(
            fname
        ) + 1  #need to add 1 here because list indexes start at 0 and rows in Sheets start at 1
        loggr("row is " + str(rowObj.row))
        print("row is " + str(rowObj.row))
        loggr("filling rowObj with row data in mtd.is_file_cataloged()")
        print("filling rowObj with row data in mtd.is_file_cataloged()")
        rowObj = make_rowObject.fill_rowObj_fromRow(rowObj, header_map, args)
        loggr("rowObj full in mtd.is_file_cataloged()")
        print("rowObj full in mtd.is_file_cataloged()")
        pprint(rowObj)
        return rowObj, header_map
    else:
        loggr("file is not cataloged")
        print("file is not cataloged")
        return False, header_map
# --- Example 17 ---
def moveDropboxToTraffic(args):
    '''
    Move fully-synced files from the Dropbox tree to NAS traffic.

    For each non-hidden, non-.tmp file under args.Dropbox:
      - ask the `dropbox filestatus` CLI whether the file is done syncing
      - if "up to date" and not already in the catalog, rsync it to
        args.traffic
      - sleep 30s between files to stay under the Sheets API rate limit

    Exits the process if the NAS traffic mount is not present.
    '''
    if not os.path.exists(args.traffic):
        loggr("mount the NAS before continuing!", **{"level": "error"})
        print("mount the NAS before continuing!")
        exit()
    loggr("args.Dropbox is " + args.Dropbox)
    for dirs, subdirs, files in os.walk(args.Dropbox):
        for file in files:
            # extensionless names are skipped entirely (also skips the
            # 30s sleep at the bottom of the loop)
            if not "." in file:
                continue
            elif not ".tmp" in file and not file.startswith("."):
                loggr("processing file " + file)
                print("processing file " + file)
                fullpath = os.path.join(dirs, file)
                # `dropbox filestatus` must run from inside the Dropbox tree
                # NOTE(review): the filename is interpolated into a
                # shell=True string -- breaks on names containing a double
                # quote; consider list-form argv
                with cd(args.Dropbox):
                    output = subprocess.check_output('dropbox filestatus "' +
                                                     file + '"',
                                                     shell=True)
                    output = output.decode("utf-8")
                    loggr(output)
                #output = "/root/Dropbox/MF archival audio/20170225_PalmDesertAct2_T585.mp3: up to date"
                # output format is "path: status"; take the part after the
                # first colon (NOTE(review): a colon in the path would break
                # this split)
                outList = output.split(":")
                status = outList[1].lstrip()
                loggr(status)
                if "up to date" in status:
                    loggr("retrieving worksheet " + args.sheet +
                          " in md.inventory_directory()")
                    print("retrieving worksheet " + args.sheet +
                          " in md.inventory_directory()")
                    args = gh.get_worksheet(args)
                    loggr(
                        "checking if file is cataloged in md.inventory_directory()"
                    )
                    print(
                        "checking if file is cataloged in md.inventory_directory()"
                    )
                    file_is_cataloged, header_map = mtd.is_file_cataloged(
                        os.path.join(args.Dropbox, file), args)
                    loggr(file_is_cataloged)
                    if not file_is_cataloged:
                        loggr("file is not cataloged")
                        print("file is not cataloged")
                        loggr("copying " + file)
                        print("copying " + file)
                        # copy (not move) to traffic; rsync preserves
                        # attributes and shows progress
                        subprocess.check_output('rsync -av --progress "' +
                                                fullpath + '" ' + args.traffic,
                                                shell=True)
                    else:
                        loggr("file " + file + " is cataloged")
                        print("file " + file + " is cataloged")
                else:
                    loggr("still copying " + outList[0])
                    print("still copying " + outList[0])
            loggr("resting 30s for API reset")
            print("resting 30s for API reset")
            time.sleep(30)
# --- Example 18 ---
def process_single_file(fullpath, args):
    '''
    Run a single file through the cataloging process.

    Looks the file up in the catalog; if found and its file-derived data is
    incomplete, fills the missing fields from the file itself (duration,
    hash, uid) and pushes only the newly-filled cells back to the sheet.
    Files not in the catalog are left alone.
    '''
    loggr("processing file " + os.path.basename(fullpath))
    print("processing file " + os.path.basename(fullpath))
    loggr("retrieving worksheet " + args.sheet +
          " in md.inventory_directory()")
    print("retrieving worksheet " + args.sheet +
          " in md.inventory_directory()")
    args = gh.get_worksheet(args)
    loggr("checking if file is cataloged in md.inventory_directory()")
    print("checking if file is cataloged in md.inventory_directory()")
    # NOTE(review): is_file_cataloged is called twice on purpose -- it yields
    # two INDEPENDENT rowObjs. `file_is_cataloged` stays a pristine snapshot
    # of the catalog's current values, while `rowObj` is mutated below with
    # file-derived data; update_catalog() then diffs the two. Collapsing this
    # into one call would make both names alias one object and no cell would
    # ever be updated. It does cost a second full worksheet read, though --
    # a deepcopy of the first result would be cheaper; verify before changing.
    file_is_cataloged, header_map = mtd.is_file_cataloged(fullpath, args)
    rowObj, _header_map = mtd.is_file_cataloged(fullpath, args)
    loggr(file_is_cataloged)
    pprint(file_is_cataloged)
    if file_is_cataloged:
        # 'FALSE' is the sheet's literal string for an unchecked checkbox
        if rowObj.data.filedata_complete == 'FALSE':
            loggr("filling rowObj from filedata in md.inventory_directory()")
            print("filling rowObj from filedata in md.inventory_directory()")
            rowObj = mfd.fill_rowObj_fromFile(fullpath, rowObj, args)
            loggr("rowObj")
            loggr(rowObj)
        loggr("file is cataloged")
        print("file_is_cataloged")
        loggr(file_is_cataloged)
        if not rowObj.identifier:
            # neither the catalog nor the filename had a uid: mint a new one
            loggr("no identifier in catalog or filename, generating new uid")
            print("no identifier in catalog or filename, generating new uid")
            last_uid = mtd.get_last_uid(args)
            rowObj.identifier = str(int(last_uid) + 1)
            loggr("uid is " + rowObj.identifier)
            print("uid is " + rowObj.identifier)
        loggr("sending updates to catalog in md.inventory_directory()")
        print("sending updates to catalog in md.inventory_directory()")
        # rowObj = file-derived values; file_is_cataloged = catalog snapshot
        update_catalog(rowObj, file_is_cataloged, header_map, args)
# --- Example 19 ---
def init_rowObject(args):
    '''
    Build a blank rowObject plus a header -> column-letter map,
    e.g. identifier: A.
    '''
    loggr("initializing rowObj in init_rowObject()")
    print("initializing rowObj in init_rowObject()")
    row_object = dotdict({"identifier": "", "row": "", "data": {}})
    loggr(
        "getting header_row and creating header_column_map in mro.init_rowObject()"
    )
    print(
        "getting header_row and creating header_column_map in mro.init_rowObject()"
    )
    header_row, header_column_map = get_header(args)
    loggr(header_row)
    loggr(header_column_map)
    loggr("cleaning header row in mro.init_rowObj()")
    print("cleaning header row in mro.init_rowObj()")
    header_row = clean_header_row(header_row)
    loggr(header_row)
    loggr("cleaning header_column_map in mro.init_rowObj()")
    print("cleaning header_column_map in mro.init_rowObj()")
    header_map = clean_header_column_map(header_column_map, header_row)
    loggr(header_map)
    loggr("initializing rowObj.data dotdict in mro.init_rowObject()")
    print("initializing rowObj.data dotdict in mro.init_rowObject()")
    # one empty-string slot per cleaned header, wrapped as a dotdict
    row_object.data = dotdict({header: "" for header in header_row})
    loggr("rowObj and header_map initialized in mro.init_rowObj")
    print("rowObj and header_map initialized in mro.init_rowObj")
    return row_object, header_map