Example #1
0
 def run(self, md5, coll):
     """Index uploads selected by md5, by collection, or all of them.

     Exactly one selection is used, checked in this order:

     * ``md5`` truthy: index the first ``Upload`` whose md5 matches.
     * ``coll`` truthy: index every file of every thing in the
       ``Collection`` with that id.
     * otherwise: index every ``Upload``, ordered by ``-created_at``.

     A missing upload/collection is reported and skipped instead of
     crashing, and PDF parse errors are tolerated in every branch so one
     bad file does not abort the rest of the run.
     """

     def index_safely(upload):
         # Parse failures are per-file problems: report them and move on.
         try:
             indexUpload(upload)
         except PDFSyntaxError:
             print('- Skipping... syntax error')
         except PSEOF:
             print('- Skipping... unexplained EOF')

     if md5:
         u = Upload.objects.filter(md5=md5).first()
         if u is None:
             # first() yields None when the query matches nothing.
             print('- No upload found with md5 %s' % md5)
             return
         index_safely(u)
     elif coll:
         c = Collection.objects.filter(id=coll).first()
         if c is None:
             print('- No collection found with id %s' % coll)
             return
         for ct in c.things:
             for u in ct.thing.files:
                 index_safely(u)
     else:
         for u in Upload.objects().order_by('-created_at').all():
             index_safely(u)
Example #2
0
 def run(self, md5, coll):
     """Index uploads selected by md5, by collection, or all of them.

     Exactly one selection is used, checked in this order:

     * ``md5`` truthy: index the first ``Upload`` whose md5 matches.
     * ``coll`` truthy: index every file of every thing in the
       ``Collection`` with that id.
     * otherwise: index every ``Upload``, ordered by ``-created_at``.

     A missing upload/collection is reported and skipped instead of
     crashing, and PDF parse errors are tolerated in every branch so one
     bad file does not abort the rest of the run.
     """

     def index_safely(upload):
         # Parse failures are per-file problems: report them and move on.
         try:
             indexUpload(upload)
         except PDFSyntaxError:
             print('- Skipping... syntax error')
         except PSEOF:
             print('- Skipping... unexplained EOF')

     if md5:
         u = Upload.objects.filter(md5=md5).first()
         if u is None:
             # first() yields None when the query matches nothing.
             print('- No upload found with md5 %s' % md5)
             return
         index_safely(u)
     elif coll:
         c = Collection.objects.filter(id=coll).first()
         if c is None:
             print('- No collection found with id %s' % coll)
             return
         for ct in c.things:
             for u in ct.thing.files:
                 index_safely(u)
     else:
         for u in Upload.objects().order_by('-created_at').all():
             index_safely(u)
Example #3
0
    def run(self, **kwargs):
        """Re-attach missing uploaders to uploads using legacy Drupal 6 data.

        Builds a map from file size to ``(legacy user _id, filename)`` out of
        ``drupal-6-files.csv`` in ``app.config['UPLOADS_DIR']``, then walks
        every file of every document in the legacy ``images`` collection.
        For each file that has no uploader, whose size is in the map and
        whose sha1 matches an existing ``Upload``, the upload's ``creator``
        is set to the ``User`` with the mapped id and the upload is saved.

        Legacy rows are keyed by file size alone, so rows sharing a size
        overwrite each other (the last CSV row wins).

        ``kwargs`` is accepted but not used. Progress and a final summary
        are printed; nothing is returned.
        """
        import csv

        # for pre-2014 data
        client = MongoClient()
        db = client.aaaart
        legacy_users = db.people

        # get a handle on pre-2013 data
        drupal_6_data_csv = os.path.join(
            app.config['UPLOADS_DIR'], 'drupal-6-files.csv')
        legacy_data_map = {}
        with open(drupal_6_data_csv, 'rb') as csv_file:
            reader = csv.reader(csv_file)
            next(reader, None)  # skip the header row
            for row in reader:
                if not row:
                    continue  # blank line: nothing to split
                # Each row carries its ';'-separated fields in the first cell.
                parts = row[0].split(';')
                if len(parts) <= 5:
                    continue
                user_id, filename, file_size = parts[1], parts[2], parts[5]
                try:
                    legacy_user = legacy_users.find_one(
                        {"grr_id": int(user_id)})
                    if legacy_user and '_id' in legacy_user:
                        legacy_data_map[int(file_size)] = (
                            legacy_user['_id'], filename)
                except Exception:
                    # Best effort: a malformed row must not stop the run,
                    # but Ctrl-C / SystemExit still propagate.
                    print("skipping adding to legacy data map: %s" % (parts,))
        print("legacy map has size:  %s" % len(legacy_data_map))

        empty_count = 0
        fixed_count = 0
        for old_thing in db.images.find(timeout=False):
            for old_file in old_thing['files']:
                if old_file['uploader']:
                    continue  # its not empty so don't handle it
                empty_count += 1
                size = old_file['size']
                if size not in legacy_data_map:
                    continue  # we don't have the old data either, so we can't handle it
                try:
                    sha1 = (old_file['sha1'].encode('utf-8').strip()
                            if 'sha1' in old_file else None)
                except Exception:
                    sha1 = None

                existing = Upload.objects(sha1=sha1).first() if sha1 else None
                if not existing:
                    print("%s doesn't seem to exist in 2014- database"
                          % (old_file['name'],))
                    continue

                legacy_user_id = str(legacy_data_map[size][0])
                try:
                    curr_user = User.objects.get(id=legacy_user_id)  # curr user
                    existing.creator = curr_user
                    existing.save()
                    # Count only after the save went through, so the summary
                    # reflects uploads that were really reassigned.
                    fixed_count += 1
                    print("%s should now be assigned to  %s"
                          % (existing.structured_file_name, curr_user.email))
                except Exception:
                    print("Can't associate %s with a user because user %s doesn't exist in new database"
                          % (old_file['name'], legacy_user_id))
            # Checked once per legacy document, so the same progress line can
            # repeat while empty_count stays unchanged.
            if empty_count % 1000 == 1:
                print("There are %s bad files and this script fixed %s of them."
                      % (empty_count, fixed_count))
        print("There were %s bad files and this script fixed %s of them."
              % (empty_count, fixed_count))
Example #4
0
    def run(self, **kwargs):
        """Re-attach missing uploaders to uploads using legacy Drupal 6 data.

        Builds a map from file size to ``(legacy user _id, filename)`` out of
        ``drupal-6-files.csv`` in ``app.config['UPLOADS_DIR']``, then walks
        every file of every document in the legacy ``images`` collection.
        For each file that has no uploader, whose size is in the map and
        whose sha1 matches an existing ``Upload``, the upload's ``creator``
        is set to the ``User`` with the mapped id and the upload is saved.

        Legacy rows are keyed by file size alone, so rows sharing a size
        overwrite each other (the last CSV row wins).

        ``kwargs`` is accepted but not used. Progress and a final summary
        are printed; nothing is returned.
        """
        import csv

        # for pre-2014 data
        client = MongoClient()
        db = client.aaaart
        legacy_users = db.people

        # get a handle on pre-2013 data
        drupal_6_data_csv = os.path.join(app.config['UPLOADS_DIR'],
                                         'drupal-6-files.csv')
        legacy_data_map = {}
        with open(drupal_6_data_csv, 'rb') as csv_file:
            reader = csv.reader(csv_file)
            next(reader, None)  # skip the header row
            for row in reader:
                if not row:
                    continue  # blank line: nothing to split
                # Each row carries its ';'-separated fields in the first cell.
                parts = row[0].split(';')
                if len(parts) <= 5:
                    continue
                user_id, filename, file_size = parts[1], parts[2], parts[5]
                try:
                    legacy_user = legacy_users.find_one(
                        {"grr_id": int(user_id)})
                    if legacy_user and '_id' in legacy_user:
                        legacy_data_map[int(file_size)] = (
                            legacy_user['_id'], filename)
                except Exception:
                    # Best effort: a malformed row must not stop the run,
                    # but Ctrl-C / SystemExit still propagate.
                    print("skipping adding to legacy data map: %s" % (parts,))
        print("legacy map has size:  %s" % len(legacy_data_map))

        empty_count = 0
        fixed_count = 0
        for old_thing in db.images.find(timeout=False):
            for old_file in old_thing['files']:
                if old_file['uploader']:
                    continue  # its not empty so don't handle it
                empty_count += 1
                size = old_file['size']
                if size not in legacy_data_map:
                    continue  # we don't have the old data either, so we can't handle it
                try:
                    sha1 = (old_file['sha1'].encode('utf-8').strip()
                            if 'sha1' in old_file else None)
                except Exception:
                    sha1 = None

                existing = Upload.objects(sha1=sha1).first() if sha1 else None
                if not existing:
                    print("%s doesn't seem to exist in 2014- database"
                          % (old_file['name'],))
                    continue

                legacy_user_id = str(legacy_data_map[size][0])
                try:
                    curr_user = User.objects.get(id=legacy_user_id)  # curr user
                    existing.creator = curr_user
                    existing.save()
                    # Count only after the save went through, so the summary
                    # reflects uploads that were really reassigned.
                    fixed_count += 1
                    print("%s should now be assigned to  %s"
                          % (existing.structured_file_name, curr_user.email))
                except Exception:
                    print("Can't associate %s with a user because user %s doesn't exist in new database"
                          % (old_file['name'], legacy_user_id))
            # Checked once per legacy document, so the same progress line can
            # repeat while empty_count stays unchanged.
            if empty_count % 1000 == 1:
                print("There are %s bad files and this script fixed %s of them."
                      % (empty_count, fixed_count))
        print("There were %s bad files and this script fixed %s of them."
              % (empty_count, fixed_count))