def get_scheme(self, url):
    '''Get the deduplication key for a URL (md5 of path + params).'''
    scheme = "%s%s" % (self.get_path(url), self.get_params(url))
    scheme = md5(scheme)
    return scheme
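# get_scheme above (and several snippets below) call an md5 helper that takes a
# string and returns its hex digest. A minimal sketch under that assumption,
# using hashlib; the UTF-8 encoding is an assumption, not taken from the source.
import hashlib

def md5(text):
    """Hypothetical helper: hex md5 digest of a string."""
    if isinstance(text, str):
        text = text.encode('utf-8')
    return hashlib.md5(text).hexdigest()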
def prepare(self):
    self.ammunitions = []
    _num = random.randint(6, 18)
    _real = random.randint(1, 18)
    logger.info("[+] Mix Flow Number : %d" % _num)
    for i in range(_num):
        rsa = 0
        if random.choice([1, 0]):
            shell_name = self.random_shell_name()
        else:
            shell_name = self.shell
        if _real == i:
            # real attack traffic
            shell_name = self.shell
            if self.passwd:
                data = {self.passwd: self.payload}
            else:
                data = self.payload
            rsa = 1
            logger.debug("[#] Real Attack %s" % shell_name)
        elif i % 2 == 0:
            data = {
                'p': md5(str(random.randint(1000000, 1000050))),
                'c': self.random_data()
            }
        else:
            data = b64e(self.random_bytes())
            rsa = 1
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        self.ammunitions.append({
            "data": data,
            "headers": headers,
            "name": shell_name,
            "rsa": rsa
        })
def post(self):
    password = self.get_argument('password', '')
    newpassword = self.get_argument('newpassword', '')
    confirmpassword = self.get_argument('confirmpassword', '')
    user = BlogUser.get_user_by_username(self.get_current_user)
    email = self.get_argument('email', user.email)
    if password and newpassword and confirmpassword:
        if newpassword == confirmpassword:
            user = self.db.get("select * from py_user where username=%s limit 1",
                               self.get_current_user)
            if user:
                if user.password == common.md5(password):
                    rowcount = BlogUser.update_user(self.get_current_user, email,
                                                    common.md5(newpassword))
                    if rowcount == 1:
                        self.set_secure_cookie(AUTH_COOKIE_NAME, '', expires_days=-10)
                        self.flash(u"Password changed successfully; please log in again.")
                    else:
                        self.flash(u"Failed to save the new password")
                else:
                    self.flash(u"The original password is incorrect")
        else:
            self.flash(u"The two passwords entered do not match")
    else:
        BlogUser.update_user(self.get_current_user, email, user.password)
    self.redirect('')
def verify_user(phone, password):
    account = conn.db['account']
    try:
        re = account.find_one({'phone': int(phone)})
        return (re is not None
                and 'password' in re
                and re['password'] == common.md5(password)
                and 'super_member' in re
                and re['super_member'] == 1)
    except Exception as e:
        print(e)
        return False
def getImage(imgurl, lock=fileLock):
    content = getHtmlwithBaiduCookie(imgurl)
    if not content:
        print('getImage failure')
        return
    imgname = md5(imgurl) + '.jpg'
    saveasImage(imgname, content)
    lock.acquire()
    savetoDownloaded(imgurl, imgname)
    lock.release()
def verify_user(username, password):
    db = Database('verify_user')
    re = db.query(
        """select password from account where number='%s' and status = 1 """,
        (username, ), True)
    if re and 'password' in re and re['password'] == common.md5(password):
        return True
    return False
def post(self):
    username = self.get_argument('username', '')
    password = self.get_argument('password', '')
    rememberme = int(self.get_argument('rememberme', 1))
    if self.db.get("select * from py_user where username=%s and password=%s limit 1",
                   username, common.md5(password)):
        self.set_secure_cookie(AUTH_COOKIE_NAME, username, httponly=True,
                               expires_days=rememberme)
        self.redirect('/admin/')
    else:
        self.flash(u'Username and password do not match')
        self.redirect('/admin/login/')
def load_incoming_file_to_store(self, limit=DEF_LIMIT):
    for pair in self.dao.fetch_new_from_main(limit):
        fullname, src_md5, size = pair
        src_fullname = self.incoming + fullname
        dest_fullname = self.store + fullname
        dest_path, filename = os.path.split(dest_fullname)
        if not os.path.exists(dest_path):
            os.mkdir(dest_path)
        shutil.copyfile(src_fullname, dest_fullname)
        if md5(dest_fullname) == src_md5:
            self.dao.update_file_info_to_main(
                {'status': '+1'}, ["fullname='" + fullname + "'"])
            self.dao.add_file_info_to_local(fullname, src_md5, size)
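# load_incoming_file_to_store compares md5(dest_fullname) against a stored digest,
# so here md5 (like common.md5 on file paths in later snippets) hashes file
# contents rather than a string. A minimal sketch under that assumption; the
# chunk size is arbitrary and the helper name is reused only for illustration.
import hashlib

def md5(path, chunk_size=1 << 20):
    """Hypothetical helper: hex md5 digest of a file, read in chunks."""
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()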
def test(retry=None):
    db = Database('test')
    if retry is None:
        sql = "select * from news where status = 1"
    else:
        sql = "select * from news where status = 1 and news_id not in (select news_id from detail)"
    re = db.query(sql)
    del db
    for news in re:
        print(insert_detail(common.md5(news['source_url']), news['news_id']))
    return '<center><h1> %d </h1></center>' % len(re)
def hotList(max=5, type=6):
    db = Database('hotlist')
    if type != 6:
        re = db.query(
            """select * from news where type = %d and status = 1 order by read_count desc limit %d offset 0""",
            (type, max))
    else:
        re = db.query(
            """select * from news where status = 1 order by read_count desc limit %d offset 0""",
            (max, ))
    news_list = []
    for order, news in enumerate(re, start=1):
        news['target'] = common.md5(news['source_url'])
        news['order'] = order
        news_list.append(news)
    return news_list
def create_user(username, password, email):
    # create a user, but check if user exists first
    if User.objects(username=username).count() > 0:
        raise ServerError(ServerErrorCodes.ERR_INPUT_ERROR,
                          message='username already exists')
    # md5 hash the string
    hashed_password = md5(password, hash_secret)
    print(hashed_password)
    # save the user
    user = User()
    user.username = username
    user.password = hashed_password
    user.email = email
    user.save()
    return None
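# create_user above hashes with a two-argument md5(password, hash_secret). A
# minimal sketch of such a helper, assuming the secret is simply mixed in as a
# salt; the argument order and salting scheme are assumptions, not taken from
# the original module.
import hashlib

def md5(text, secret=''):
    """Hypothetical salted helper: hex md5 digest of secret + text."""
    return hashlib.md5((secret + text).encode('utf-8')).hexdigest()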
def post(self):
    import common
    username = self.get_argument('username', '')
    if not username:
        self.flash(u'Please enter a username')
        self.redirect('')
    else:
        user = BlogUser.get_user_by_username(username)
        if not user:
            self.flash(u'Username does not exist')
            self.redirect('')
        else:
            kvdata = {"user": username, "email": user.email}
            key = common.md5(common.randomstr(20))
            common.set_kvdb_value(key, kvdata)
            url = 'http://%s/reset/%s/' % (self.request.host, key)
            common.sendmail(user.email, u'Reset password', url)
            self.flash(u'Email sent successfully')
            self.redirect('')
def insert_detail(target, id):
    if target is None or id is None:
        return 'Parameters must not be empty'
    if target == common.md5(s_meiyou.SOURCE_HOST):
        content = s_meiyou.detail(id)
        return content
    if target == __md5(s_dayima.SOURCE_HOST):
        content = s_dayima.detail(id)
        return content
    if target == __md5(s_yidianzixun.SOURCE_HOST):
        content = s_yidianzixun.detail(id)
        return content
    if target == __md5(s_sohu.SOURCE_HOST):
        content = s_sohu.detail(id)
        return content
    return 'No matching target'
def post(self, key):
    import common
    value = common.get_kvdb_value(key)
    if not value:
        self.write(u'Sorry, this token has expired')
        return
    password = self.get_argument('password', '')
    confirm = self.get_argument('confirmpassword', '')
    if not password or not confirm:
        self.flash(u'Password must not be empty')
        self.redirect('')
        return
    if password != confirm:
        self.flash(u'The two passwords entered do not match')
        self.redirect('')
        return
    BlogUser.update_user(value['user'], value['email'], common.md5(password))
    common.delete_kvdb(key)
    self.set_secure_cookie(AUTH_COOKIE_NAME, '', expires_days=-7)
    self.write(u'Password reset successfully')
def update_md5(db):
    limit = 1000
    skip = 0
    while True:
        rows = db.fetchall(
            "select id, url from " + table_name + " where id <= %s and id > %s",
            (limit + skip, skip))
        if not len(rows):
            break
        skip += limit
        for row in rows:
            id, url = row
            if not url:
                continue
            db.execute(
                "update " + table_name + " set url_md5='%s' where id = '%s'",
                (md5(url), id))
        db.commit()
    db.close()
def process_files(root, files, cursor, connection):
    for file in files:
        if file.startswith('.'):
            continue
        path_ = os.path.join(root, file)
        try:
            type_ = magic.from_file(path_, mime=True)
        except PermissionError:
            type_ = None
        try:
            md5_ = md5(path_)
        except PermissionError:
            md5_ = None
        size = os.path.getsize(path_)
        cursor.execute('''
            SELECT EXISTS(SELECT * FROM files WHERE path_=?);
        ''', (path_,))
        fetched = cursor.fetchone()
        if fetched == (1, ):
            cursor.execute('''
                SELECT * FROM files
                WHERE path_=? AND (type_!=? OR md5!=? OR size!=?)
            ''', (path_, type_, md5_, size))
            fetched = cursor.fetchone()
            if fetched:
                cursor.execute('''
                    UPDATE files SET type_=?, md5=?, size=?
                    WHERE path_=?
                ''', (type_, md5_, size, path_))
                print('UPDATE PATH={}'.format(path_))
            else:
                print('SKIP PATH={}'.format(path_))
        else:
            cursor.execute('''
                INSERT INTO files (path_, type_, md5, "size")
                VALUES (?, ?, ?, ?)
            ''', (path_, type_, md5_, size))
            print('INSERT PATH={}'.format(path_))
    connection.commit()
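# process_files (and the __main__ block further below) assume a sqlite `files`
# table with path_, type_, md5 and size columns. A schema sketch inferred from
# the INSERT/UPDATE statements; the column types and the path_ key are
# assumptions, not taken from the original database.
import sqlite3

connection = sqlite3.connect('files2.db')
connection.execute('''
    CREATE TABLE IF NOT EXISTS files (
        path_  TEXT PRIMARY KEY,
        type_  TEXT,
        md5    TEXT,
        "size" INTEGER
    )
''')
connection.commit()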
def main(input_bam, paired_end, spp_version):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    input_bam_file = dxpy.DXFile(input_bam)
    input_bam_filename = input_bam_file.name
    input_bam_basename = input_bam_file.name.rstrip('.bam')
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    # ===================
    # Create tagAlign file
    # ===================
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        # need namesorted bam to make BEDPE
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        samtools_sort_command = \
            "samtools sort -n %s %s" % (input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(samtools_sort_command)
        subprocess.check_output(shlex.split(samtools_sort_command))
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    # =================================
    # Subsample tagAlign file
    # =================================
    logger.info("Intermediate tA md5: %s" % (common.md5(intermediate_TA_filename)))
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    subsampled_TA_filename = \
        input_bam_basename + \
        ".filt.nodup.sample.%d.%s.tagAlign.gz" % (NREADS/1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (intermediate_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, intermediate_TA_filename)]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)
    logger.info("Subsampled tA md5: %s" % (common.md5(subsampled_TA_filename)))

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab>
    # PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr <tab>
    # phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag

    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    # run spp
    run_spp_command = '/phantompeakqualtools/run_spp_nodups.R'
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s"
        % (run_spp_command, subsampled_TA_filename, cpu_count(),
           CC_plot_filename, CC_scores_filename)])
    out, err = common.run_pipe(
        [r"""sed -r 's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)])

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)
    CC_scores_file = dxpy.upload_local_file(CC_scores_filename)
    CC_plot_file = dxpy.upload_local_file(CC_plot_filename)

    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "tagAlign_file": dxpy.dxlink(tagAlign_file),
        "CC_scores_file": dxpy.dxlink(CC_scores_file),
        "CC_plot_file": dxpy.dxlink(CC_plot_file),
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }
    if paired_end:
        output.update({"BEDPE_file": dxpy.dxlink(BEDPE_file)})
    return output
def main():
    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--key', default='default',
                        help="The keypair identifier from the keyfile for the server. Default is --key=default")
    parser.add_argument('--keyfile', default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file. Default is --keyfile=%s" % (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--infile', '-i', help="CSV file with metadata to update")
    parser.add_argument('--dryrun', default=False, action='store_true',
                        help="Do everything except save changes")
    parser.add_argument('--debug', default=False, action='store_true',
                        help="Print debug messages. Default is False.")
    parser.add_argument('--put', default=False, action='store_true',
                        help="If property in the input is blank, remove that property entirely from the existing object")
    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
    else:
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

    key = ENC_Key(args.keyfile, args.key)  # get the keypair
    connection = ENC_Connection(key)  # initialize the connection object
    # biosample_collection = ENC_Collection(connection,'biosamples',frame='object')

    with open(args.infile, 'rU') as f:
        reader = csv.DictReader(f, delimiter=',', quotechar='"')
        for new_metadata in reader:
            uuid = new_metadata.pop('uuid', None)
            accession = new_metadata.pop('accession', None)
            if uuid:  # use the uuid if there is one
                obj_id = uuid
            elif accession:  # if no uuid then use the accession
                obj_id = accession
            else:  # if neither uuid nor accession, assume this is a new object
                obj_id = None
            enc_object = ENC_Item(connection, obj_id)
            # print "Got accessioned object %s with status %s" %(enc_object.get('accession'), enc_object.get('status'))
            for prop in new_metadata:
                if new_metadata[prop].strip() == "":
                    if args.put:  # if empty, pop out the old property from the object
                        old_value = enc_object.properties.pop(prop, None)
                    continue  # skip properties with no value for post or patch
                else:  # new property or new value for old property
                    new_metadata_string = new_metadata[prop]
                    if ':' in prop:
                        prop_name, sep, prop_type = prop.partition(':')
                    else:
                        prop_name = prop
                        prop_type = 'string'
                    if prop_type == 'array':
                        # subreader = csv.reader(StringIO(new_metadata_string), delimiter=',', quotechar='"')
                        # array_items = []
                        # for line in subreader:
                        #     for s in line:
                        #         array_items.append(s)
                        print("new_metadata_string is %s" % (new_metadata_string))
                        array_items = json.loads(new_metadata_string)
                        print("array_items is %s" % (array_items))
                        json_obj = {prop_name: array_items}
                    elif prop_type == 'int' or prop_type == 'integer':
                        json_obj = {prop_name: int(new_metadata_string)}
                    elif prop_type == 'float':
                        json_obj = {prop_name: float(new_metadata_string)}
                    else:
                        json_obj = {prop_name: new_metadata_string}  # default is string
                    enc_object.properties.update(json_obj)
            if 'submitted_file_name' in enc_object.properties:
                path = os.path.expanduser(enc_object.get('submitted_file_name'))
                path = os.path.abspath(path)
                basename = os.path.basename(path)
                enc_object.properties.update({
                    'submitted_file_name': basename,
                    'md5sum': common.md5(path),
                    'file_size': os.path.getsize(path)})
            if obj_id:
                logger.info('Syncing %s' % (obj_id))
            else:
                logger.info('Syncing new object')
            logger.debug('%s' % (json.dumps(enc_object.properties, sort_keys=True,
                                            indent=4, separators=(',', ': '))))
            if not args.dryrun:
                new_object = enc_object.sync()
                try:
                    new_accession = new_object['accession']
                except:
                    pass
                else:
                    print("New accession: %s" % (new_accession))
                if enc_object.type == 'file' and 'submitted_file_name' in json_obj:
                    upload_credentials = enc_object.new_creds()
                    print(upload_credentials)
                    rc = upload_file(upload_credentials, path)
                    print("Upload rc: %d" % (rc))
def random_shell_name(self, ext=".php"):
    return "/" + md5(self.random_str()) + ext
# Request URL: the API endpoint to load-test
# url = setUp_()[-1] + "/h5/speak/add"      # get the topic list
# url = setUp_()[-1] + "/h5/comment/add"    # get the comment list
url = setUp_()[-1] + "/h5/comment/getComment"

# Build the request headers
ts = setUp_()[0]
reqId = setUp_()[1]
secret = setUp_()[2]
header = setUp_()[3]
# db = setUp_()[4]
userId = setUp_()[4]
liveId = setUp_()[5]
reqSign = reqId + ':' + secret + ':' + ts
sign = md5(reqSign)

# data = {
#     "id": reqId,
#     "timestamp": ts,
#     "sign": sign,
#     "data": {
#         "topicId": "290000451050003",
#         "type": "text",
#         "liveId": liveId,
#         "content": "I am speaking; the little tail at the end is pretty neat, isn't it..." + ts,
#         "isReplay": "N",
#         "page": {"size": "20", "page": "1"},
#         "userId": userId
#     }
# }
# payload for adding a comment
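# A sketch of how the signed request above might be issued, assuming the
# `requests` library and the payload shape shown in the commented-out block;
# it continues from the variables defined above (url, header, reqId, ts, sign,
# liveId, userId), and the exact field set expected by getComment is an
# assumption.
import json
import requests

payload = {
    "id": reqId,
    "timestamp": ts,
    "sign": sign,  # md5 of "reqId:secret:ts", as computed above
    "data": {
        "liveId": liveId,
        "userId": userId,
        "page": {"size": "20", "page": "1"},
    },
}
resp = requests.post(url, headers=header, data=json.dumps(payload))
print(resp.status_code, resp.text)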
def main(input_bam, paired_end, spp_version):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    input_bam_file = dxpy.DXFile(input_bam)
    input_bam_filename = input_bam_file.name
    input_bam_basename = input_bam_file.name.rstrip('.bam')
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    # ===================
    # Create tagAlign file
    # ===================
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        # need namesorted bam to make BEDPE
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        samtools_sort_command = \
            "samtools sort -n %s %s" % (input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(samtools_sort_command)
        subprocess.check_output(shlex.split(samtools_sort_command))
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    # =================================
    # Subsample tagAlign file
    # =================================
    logger.info(
        "Intermediate tA md5: %s" % (common.md5(intermediate_TA_filename)))
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    subsampled_TA_filename = \
        input_bam_basename + \
        ".filt.nodup.sample.%d.%s.tagAlign.gz" % (NREADS/1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (intermediate_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, intermediate_TA_filename)]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)
    logger.info(
        "Subsampled tA md5: %s" % (common.md5(subsampled_TA_filename)))

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab>
    # PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr <tab>
    # phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag

    # spp_tarball = SPP_VERSION_MAP.get(spp_version)
    # assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # # install spp
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    # run spp
    run_spp_command = '/phantompeakqualtools/run_spp.R'
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s"
        % (run_spp_command, subsampled_TA_filename, cpu_count(),
           CC_plot_filename, CC_scores_filename)])
    out, err = common.run_pipe([
        r"""sed -r 's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe([
        "mv temp %s" % (CC_scores_filename)])

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)
    CC_scores_file = dxpy.upload_local_file(CC_scores_filename)
    CC_plot_file = dxpy.upload_local_file(CC_plot_filename)

    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "tagAlign_file": dxpy.dxlink(tagAlign_file),
        "CC_scores_file": dxpy.dxlink(CC_scores_file),
        "CC_plot_file": dxpy.dxlink(CC_plot_file),
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }
    if paired_end:
        output.update({"BEDPE_file": dxpy.dxlink(BEDPE_file)})
    return output
def accession_file(f, keypair, server, dryrun, force):
    # check for duplication
    # - if it has an ENCFF or TSTFF number in its tag, or
    # - if there exists an accessioned file with the same submitted_file_name that is not deleted, replaced, revoked and has the same size
    # - then there should be a file with the same md5.  If not, warn of a mismatch between what's at DNAnexus and ENCODEd.
    # - If same md5, return the existing object.
    # - Next, check if there's already a file with the same md5.  If it's deleted, replaced, revoked, then remodel it if --force=true,
    # - Else warn and return None
    # download
    # calculate md5 and add to f.md5sum
    # post file and get accession, upload credentials
    # upload to S3
    # remove the local file (to save space)
    # return the ENCODEd file object
    logger.debug('in accession_file with f %s' % (pprint.pformat(f['submitted_file_name'])))
    dx = f.pop('dx')
    local_fname = dx.name
    logger.info("Downloading %s" % (local_fname))
    dxpy.download_dxfile(dx.get_id(), local_fname)
    f.update({'md5sum': common.md5(local_fname)})
    f['notes'] = json.dumps(f.get('notes'))
    # check to see if md5 already in the database
    url = server + '/md5:%s?format=json&frame=object' % (f.get('md5sum'))
    r = common.encoded_get(url, keypair, return_response=True)
    try:
        r.raise_for_status()
    except:
        if r.status_code == 404:
            logger.info('No md5 matches %s' % (f.get('md5sum')))
            md5_exists = False
        else:
            logger.error('MD5 duplicate check. GET failed: %s %s' % (r.status_code, r.reason))
            logger.error(r.text)
            md5_exists = None
    else:
        md5_exists = r.json()

    # check if an ENCODE accession number is in the list of tags, as it would be
    # if accessioned by this script or similar scripts
    for tag in dx.tags:
        m = re.findall(r'ENCFF\d{3}\D{3}', tag)
        if m:
            logger.info('%s appears to contain ENCODE accession number in tag %s.' % (dx.get_id(), m))
            accession_in_tag = True
            # if not force:
            #     return
        else:
            accession_in_tag = False

    # TODO check here if file is deprecated and, if so, warn
    if md5_exists:
        if force:
            return patch_file(f, keypair, server, dryrun)
        else:
            logger.info("Returning duplicate file unchanged")
            return md5_exists
    else:
        logger.info('posting new file %s' % (f.get('submitted_file_name')))
        logger.debug('%s' % (f))
        new_file_object = post_file(f, keypair, server, dryrun)

    if new_file_object:
        creds = new_file_object['upload_credentials']
        env = os.environ.copy()
        env.update({
            'AWS_ACCESS_KEY_ID': creds['access_key'],
            'AWS_SECRET_ACCESS_KEY': creds['secret_key'],
            'AWS_SECURITY_TOKEN': creds['session_token'],
        })
        logger.info("Uploading file.")
        start = time.time()
        try:
            subprocess.check_call(
                ['aws', 's3', 'cp', local_fname, creds['upload_url'], '--quiet'],
                env=env)
        except subprocess.CalledProcessError as e:
            # The aws command returns a non-zero exit code on error.
            logger.error("Upload failed with exit code %d" % e.returncode)
        else:
            end = time.time()
            duration = end - start
            logger.info("Uploaded in %.2f seconds" % duration)
            dx.add_tags([new_file_object.get('accession')])

    try:
        os.remove(local_fname)
    except:
        pass

    return new_file_object
def main():
    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--key', default='default',
                        help="The keypair identifier from the keyfile for the server. Default is --key=default")
    parser.add_argument('--keyfile', default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file. Default is --keyfile=%s" % (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--infile', '-i', help="CSV file with metadata to update")
    parser.add_argument('--dryrun', default=False, action='store_true',
                        help="Do everything except save changes")
    parser.add_argument('--debug', default=False, action='store_true',
                        help="Print debug messages. Default is False.")
    parser.add_argument('--put', default=False, action='store_true',
                        help="If property in the input is blank, remove that property entirely from the existing object")
    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
    else:
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

    key = ENC_Key(args.keyfile, args.key)  # get the keypair
    connection = ENC_Connection(key)  # initialize the connection object
    # biosample_collection = ENC_Collection(connection,'biosamples',frame='object')

    with open(args.infile, 'rU') as f:
        reader = csv.DictReader(f, delimiter=',', quotechar='"')
        for new_metadata in reader:
            uuid = new_metadata.pop('uuid', None)
            accession = new_metadata.pop('accession', None)
            if uuid:  # use the uuid if there is one
                obj_id = uuid
            elif accession:  # if no uuid then use the accession
                obj_id = accession
            else:  # if neither uuid nor accession, assume this is a new object
                obj_id = None
            enc_object = ENC_Item(connection, obj_id)
            # print "Got accessioned object %s with status %s" %(enc_object.get('accession'), enc_object.get('status'))
            submit_new_file = False
            for prop in new_metadata:
                if new_metadata[prop].strip() == "":
                    if args.put:  # if empty, pop out the old property from the object
                        old_value = enc_object.properties.pop(prop, None)
                    continue  # skip properties with no value for post or patch
                else:  # new property or new value for old property
                    new_metadata_string = new_metadata[prop]
                    if ':' in prop:
                        prop_name, sep, prop_type = prop.partition(':')
                    else:
                        prop_name = prop
                        prop_type = 'string'
                    if prop_type == 'array':
                        # subreader = csv.reader(StringIO(new_metadata_string), delimiter=',', quotechar='"')
                        # array_items = []
                        # for line in subreader:
                        #     for s in line:
                        #         array_items.append(s)
                        logger.debug("new_metadata_string is %s" % (new_metadata_string))
                        array_items = json.loads(new_metadata_string)
                        logger.debug("array_items is %s" % (array_items))
                        json_obj = {prop_name: array_items}
                    elif prop_type == 'int' or prop_type == 'integer':
                        json_obj = {prop_name: int(new_metadata_string)}
                    elif prop_type == 'float':
                        json_obj = {prop_name: float(new_metadata_string)}
                    else:
                        json_obj = {prop_name: new_metadata_string}  # default is string
                    if prop == 'submitted_file_name':
                        new_filename = new_metadata_string
                        old_filename = enc_object.properties['submitted_file_name']
                        if new_filename != old_filename:
                            submit_new_file = True
                    enc_object.properties.update(json_obj)
            if submit_new_file:
                path = os.path.expanduser(enc_object.get('submitted_file_name'))
                path = os.path.abspath(path)
                basename = os.path.basename(path)
                enc_object.properties.update({
                    'submitted_file_name': basename,
                    'md5sum': common.md5(path),
                    'file_size': os.path.getsize(path)
                })
            if obj_id:
                logger.info('Syncing %s' % (obj_id))
            else:
                logger.info('Syncing new object')
            logger.debug('%s' % (json.dumps(enc_object.properties, sort_keys=True,
                                            indent=4, separators=(',', ': '))))
            result = enc_object.sync(args.dryrun)
            if not args.dryrun:
                try:
                    assert result['status'] == 'success'
                except:
                    logger.error('New object sync failed ... Skipping. %s' % (result))
                else:
                    new_object = result['@graph'][0]
                    if 'accession' in new_object:
                        new_id = new_object['accession']
                    else:
                        new_id = new_object['uuid']
                    logger.info("New object: %s" % (new_id))
                    if enc_object.type == 'file' and 'submitted_file_name' in json_obj:
                        upload_credentials = enc_object.new_creds()
                        logger.debug(upload_credentials)
                        rc = upload_file(upload_credentials, path)
                        logger.info("Upload rc: %d" % (rc))
def accession_file(f, keypair, server, dryrun, force):
    # check for duplication
    # download
    # calculate md5 and add to f.md5sum
    # post file and get accession, upload credentials
    # upload to S3
    # remove the local file (to save space)
    # return the ENCODEd file object
    already_accessioned = False
    dx = f.pop('dx')
    for tag in dx.tags:
        m = re.search(r'(ENCFF\d{3}\D{3})|(TSTFF\D{6})', tag)
        if m:
            logger.info(
                '%s appears to contain ENCODE accession number in tag %s ... skipping'
                % (dx.get_id(), m.group(0)))
            already_accessioned = True
            break
    if already_accessioned and not force:
        return
    url = urlparse.urljoin(
        server,
        'search/?type=file&submitted_file_name=%s&format=json&frame=object'
        % (f.get('submitted_file_name')))
    r = requests.get(url, auth=keypair)
    try:
        r.raise_for_status()
        if r.json()['@graph']:
            for duplicate_item in r.json()['@graph']:
                if duplicate_item.get('status') == 'deleted':
                    logger.info("A potential duplicate file was found but its status=deleted ... proceeding")
                    duplicate_found = False
                else:
                    logger.info("Found potential duplicate: %s" % (duplicate_item.get('accession')))
                    submitted_file_size = dx.describe().get('size')
                    if submitted_file_size == duplicate_item.get('file_size'):
                        logger.info(
                            "%s %s: File sizes match, assuming duplicate."
                            % (str(submitted_file_size), duplicate_item.get('file_size')))
                        duplicate_found = True
                        break
                    else:
                        logger.info(
                            "%s %s: File sizes differ, assuming new file."
                            % (str(submitted_file_size), duplicate_item.get('file_size')))
                        duplicate_found = False
        else:
            duplicate_found = False
    except:
        logger.warning('Duplicate accession check failed: %s %s' % (r.status_code, r.reason))
        logger.debug(r.text)
        duplicate_found = False

    if duplicate_found:
        if force:
            logger.info("Duplicate detected, but force=true, so continuing")
        else:
            logger.info("Duplicate detected, skipping")
            return

    local_fname = dx.name
    logger.info("Downloading %s" % (local_fname))
    dxpy.download_dxfile(dx.get_id(), local_fname)
    calculated_md5 = common.md5(local_fname)
    f.update({'md5sum': calculated_md5})
    f['notes'] = json.dumps(f.get('notes'))

    url = urlparse.urljoin(server, 'files/')
    if dryrun:
        logger.info("Dry run. Would POST %s" % (f))
        new_file_object = {}
    else:
        r = requests.post(url,
                          auth=keypair,
                          headers={'content-type': 'application/json'},
                          data=json.dumps(f))
        try:
            r.raise_for_status()
            new_file_object = r.json()['@graph'][0]
            logger.info("New accession: %s" % (new_file_object.get('accession')))
        except:
            logger.warning('POST file object failed: %s %s' % (r.status_code, r.reason))
            logger.debug(r.text)
            new_file_object = {}
            if r.status_code == 409:
                try:
                    # cautiously add a tag with the existing accession number
                    if calculated_md5 in r.json().get('detail'):
                        url = urlparse.urljoin(server, '/search/?type=file&md5sum=%s' % (calculated_md5))
                        r = requests.get(url, auth=keypair)
                        r.raise_for_status()
                        accessioned_file = r.json()['@graph'][0]
                        existing_accession = accessioned_file['accession']
                        dx.add_tags([existing_accession])
                        logger.info('Already accessioned. Added %s to dxfile tags' % (existing_accession))
                except:
                    logger.info('Conflict does not appear to be md5 ... continuing')

    if new_file_object:
        creds = new_file_object['upload_credentials']
        env = os.environ.copy()
        env.update({
            'AWS_ACCESS_KEY_ID': creds['access_key'],
            'AWS_SECRET_ACCESS_KEY': creds['secret_key'],
            'AWS_SECURITY_TOKEN': creds['session_token'],
        })
        logger.info("Uploading file.")
        start = time.time()
        try:
            subprocess.check_call(
                ['aws', 's3', 'cp', local_fname, creds['upload_url'], '--quiet'],
                env=env)
        except subprocess.CalledProcessError as e:
            # The aws command returns a non-zero exit code on error.
            logger.error("Upload failed with exit code %d" % e.returncode)
            upload_returncode = e.returncode
        else:
            upload_returncode = 0
            end = time.time()
            duration = end - start
            logger.info("Uploaded in %.2f seconds" % duration)
            dx.add_tags([new_file_object.get('accession')])
    else:
        upload_returncode = -1

    try:
        os.remove(local_fname)
    except:
        pass

    return common.encoded_get(
        urlparse.urljoin(server, '/files/%s' % (new_file_object.get('accession'))),
        keypair)
if __name__ == '__main__':
    conn = sqlite3.connect('files2.db')
    cursor = conn.cursor()
    cursor.execute('''
        SELECT path_ FROM files
    ''')
    for row in cursor.fetchall():
        absolute_path = row[0]
        if not os.path.exists(absolute_path):
            cursor.execute('''
                DELETE FROM files WHERE path_=?
            ''', (absolute_path, ))
        else:
            type_ = magic.from_file(absolute_path, mime=True)
            md5_ = md5(absolute_path)
            size = os.path.getsize(absolute_path)
            cursor.execute('''
                SELECT * FROM files
                WHERE path_=? AND (type_!=? OR md5!=? OR size!=?)
            ''', (absolute_path, type_, md5_, size))
            fetched = cursor.fetchone()
            if fetched:
                cursor.execute('''
                    UPDATE files SET type_=?, md5=?, size=?
                    WHERE path_=?
                ''', (type_, md5_, size, absolute_path))
    conn.commit()
def accession_file(f, keypair, server, dryrun, force):
    # check for duplication
    # download
    # calculate md5 and add to f.md5sum
    # post file and get accession, upload credentials
    # upload to S3
    # remove the local file (to save space)
    # return the ENCODEd file object
    already_accessioned = False
    dx = f.pop('dx')
    for tag in dx.tags:
        m = re.search(r'(ENCFF\d{3}\D{3})|(TSTFF\D{6})', tag)
        if m:
            logger.info('%s appears to contain ENCODE accession number in tag %s ... skipping' % (dx.get_id(), m.group(0)))
            already_accessioned = True
            break
    if already_accessioned and not force:
        return
    url = urlparse.urljoin(
        server,
        'search/?type=file&submitted_file_name=%s&format=json&frame=object'
        % (f.get('submitted_file_name')))
    r = requests.get(url, auth=keypair)
    try:
        r.raise_for_status()
        if r.json()['@graph']:
            for duplicate_item in r.json()['@graph']:
                if duplicate_item.get('status') == 'deleted':
                    logger.info("A potential duplicate file was found but its status=deleted ... proceeding")
                    duplicate_found = False
                else:
                    logger.info("Found potential duplicate: %s" % (duplicate_item.get('accession')))
                    submitted_file_size = dx.describe().get('size')
                    if submitted_file_size == duplicate_item.get('file_size'):
                        logger.info("%s %s: File sizes match, assuming duplicate." % (str(submitted_file_size), duplicate_item.get('file_size')))
                        duplicate_found = True
                        break
                    else:
                        logger.info("%s %s: File sizes differ, assuming new file." % (str(submitted_file_size), duplicate_item.get('file_size')))
                        duplicate_found = False
        else:
            duplicate_found = False
    except:
        logger.warning('Duplicate accession check failed: %s %s' % (r.status_code, r.reason))
        logger.debug(r.text)
        duplicate_found = False

    if duplicate_found:
        if force:
            logger.info("Duplicate detected, but force=true, so continuing")
        else:
            logger.info("Duplicate detected, skipping")
            return

    local_fname = dx.name
    logger.info("Downloading %s" % (local_fname))
    dxpy.download_dxfile(dx.get_id(), local_fname)
    calculated_md5 = common.md5(local_fname)
    f.update({'md5sum': calculated_md5})
    f['notes'] = json.dumps(f.get('notes'))

    url = urlparse.urljoin(server, 'files/')
    if dryrun:
        logger.info("Dry run. Would POST %s" % (f))
        new_file_object = {}
    else:
        r = requests.post(url,
                          auth=keypair,
                          headers={'content-type': 'application/json'},
                          data=json.dumps(f))
        try:
            r.raise_for_status()
            new_file_object = r.json()['@graph'][0]
            logger.info("New accession: %s" % (new_file_object.get('accession')))
        except:
            logger.warning('POST file object failed: %s %s' % (r.status_code, r.reason))
            logger.warning(r.text)
            new_file_object = {}
            if r.status_code == 409:
                try:
                    # cautiously add a tag with the existing accession number
                    if calculated_md5 in r.json().get('detail'):
                        url = urlparse.urljoin(server, '/search/?type=file&md5sum=%s' % (calculated_md5))
                        r = requests.get(url, auth=keypair)
                        r.raise_for_status()
                        accessioned_file = r.json()['@graph'][0]
                        existing_accession = accessioned_file['accession']
                        dx.add_tags([existing_accession])
                        logger.info('Already accessioned. Added %s to dxfile tags' % (existing_accession))
                except:
                    logger.info('Conflict does not appear to be md5 ... continuing')

    if new_file_object:
        creds = new_file_object['upload_credentials']
        env = os.environ.copy()
        env.update({
            'AWS_ACCESS_KEY_ID': creds['access_key'],
            'AWS_SECRET_ACCESS_KEY': creds['secret_key'],
            'AWS_SECURITY_TOKEN': creds['session_token'],
        })
        logger.info("Uploading file.")
        start = time.time()
        try:
            subprocess.check_call(
                ['aws', 's3', 'cp', local_fname, creds['upload_url'], '--quiet'],
                env=env)
        except subprocess.CalledProcessError as e:
            # The aws command returns a non-zero exit code on error.
            logger.error("Upload failed with exit code %d" % e.returncode)
            upload_returncode = e.returncode
        else:
            upload_returncode = 0
            end = time.time()
            duration = end - start
            logger.info("Uploaded in %.2f seconds" % duration)
            dx.add_tags([new_file_object.get('accession')])
    else:
        upload_returncode = -1

    try:
        os.remove(local_fname)
    except:
        pass

    return common.encoded_get(
        urlparse.urljoin(server, '/files/%s' % (new_file_object.get('accession'))),
        keypair)