def main():
	# parse options
	option, args = doc_optparse.parse(__doc__)
	
	if len(args) < 2:
		doc_optparse.exit()
	
	# try opening the file both ways, in case the arguments got confused
	try:
		gff_file = gff.input(args[1])
		twobit_file = twobit.input(args[0])
	except Exception:
		gff_file = gff.input(args[0])
		twobit_file = twobit.input(args[1])
	
	for record in gff_file:
		if record.seqname.startswith("chr"):
			chr = record.seqname
		else:
			chr = "chr" + record.seqname
		
		ref_seq = twobit_file[chr][(record.start - 1):record.end]
		
		if option.diff:
			if "ref_allele" in record.attributes:
				if record.attributes["ref_allele"].strip("\"") == ref_seq.upper():
					continue
		
		record.attributes["ref_allele"] = ref_seq.upper()
		print record
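# A minimal illustration of the coordinate convention used above: GFF records
# are 1-based and end-inclusive, while Python slices are 0-based and
# end-exclusive, so the reference bases for a record are seq[start - 1:end].
# The plain string below stands in for a twobit chromosome, and
# gff_record_bases() is a hypothetical helper, not part of the script above.
def gff_record_bases(chromosome_seq, start, end):
	# convert 1-based, end-inclusive GFF coordinates to a 0-based slice
	return chromosome_seq[start - 1:end]

# a record with start=3, end=5 covers the third through fifth bases
assert gff_record_bases("ACGTACGT", 3, 5) == "GTA"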
def main():
    # parse options
    option, args = doc_optparse.parse(__doc__)

    if len(args) < 2:
        doc_optparse.exit()  # Error
    elif len(args) < 3:
        out = match2dbSNP(args[0], args[1])
        for line in out:
            print line
    else:
        match2dbSNP_to_file(args[0], args[1], args[2])
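# Each main() above starts with doc_optparse.parse(__doc__): in the
# bx-python-style convention these scripts appear to follow, the module
# docstring doubles as the usage message and option table, and
# doc_optparse.exit() prints it before exiting. A hypothetical docstring for
# the script above (the exact option syntax is an assumption, not taken from
# the source):
#
#     """Match a GFF file against dbSNP.
#
#     usage: %prog dbsnp_file gff_file [output_file]
#     """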
def main():
    """Match a GFF file against JSON-formatted GET-Evidence data"""
    # parse options
    option, args = doc_optparse.parse(__doc__)

    if len(args) < 2:
        doc_optparse.exit()  # Error
    elif len(args) < 3:
        out = match_getev(args[0], args[1])
        for line in out:
            print line
    else:
        match_getev_to_file(args[0], args[1], args[2])
def main():
    # return if we don't have the correct arguments
    # parse options
    option, args = doc_optparse.parse(__doc__)

    if len(args) < 3:
        doc_optparse.exit()  # Error
    elif len(args) < 4:
        out = predict_nonsynonymous(args[0], args[1], args[2])
        for line in out:
            print line
    else:
        predict_nonsynonymous_to_file(args[0], args[1], args[2], args[3])
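# The function/"_to_file" pairs above suggest each *_to_file variant simply
# writes its generator's output to a named file instead of stdout. A
# hypothetical wrapper along those lines (the real implementations are not
# shown in this listing):
def generator_to_file(line_generator, output_path):
    out = open(output_path, "w")
    try:
        for line in line_generator:
            out.write(str(line) + "\n")
    finally:
        out.close()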
def main():
	# parse options
	option, args = doc_optparse.parse(__doc__)
	
	if len(args) < 1:
		doc_optparse.exit()
	
	# first, try to connect to the databases
	try:
		connection = MySQLdb.connect(host=DB_HOST, user=GENOTYPE_USER, passwd=GENOTYPE_PASSWD, db=GENOTYPE_DATABASE)
		cursor = connection.cursor()
	except MySQLdb.OperationalError, message:
		print "Error %d while connecting to database: %s" % (message[0], message[1])
		sys.exit()
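# The example above ends right after connecting. A minimal sketch of typical
# MySQLdb cursor usage from that point; the query text is a placeholder, not
# taken from the genotype schema:
def fetch_rows(connection, query, params=()):
	cursor = connection.cursor()
	try:
		# passing parameters separately lets MySQLdb escape them safely
		cursor.execute(query, params)
		return cursor.fetchall()
	finally:
		cursor.close()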
def main():
	# parse options
	option, args = doc_optparse.parse(__doc__)
	
	if len(args) < 2:
		doc_optparse.exit()
	
	gff_files_1 = glob.glob(args[0])
	gff_files_2 = glob.glob(args[1])
	
	# create temporary files to store intersections
	temp_file_1 = TemporaryFile()
	temp_file_2 = TemporaryFile()
	
	if not option.enumerate:
		# use a wider column if we're going to need it
		if option.read_depth:
			col_width = 24
		elif option.verbose:
			col_width = 16
		else:
			col_width = 8
		
		# print column headings
		print " " * 8,
		for i in range(1, len(gff_files_1) + 1):
			print excel_column(i).ljust(col_width),
		print ""
	
	# initialize counter to print row headings
	file_number = 0
	
	# iterate through the second list of files
	for g2_path in gff_files_2:
		
		# print row heading
		if not option.enumerate:
			file_number += 1
			print str(file_number).ljust(8),
		
		# now iterate through the first list, do intersections and compare
		for g1_path in gff_files_1:
			
			# do the intersection one way
			g1 = gff.input(g1_path)
			g2 = gff.input(g2_path)
			for line in g1.intersect(g2):
				print >> temp_file_1, line
			
			# now do the intersection the other way
			g1_reverse = gff.input(g1_path)
			g2_reverse = gff.input(g2_path)
			for line in g2_reverse.intersect(g1_reverse):
				print >> temp_file_2, line
			
			# rewind each temporary file now storing intersection data
			temp_file_1.seek(0)
			temp_file_2.seek(0)
			
			# now go through the temporary files and work out concordancy
			g1_intx = gff.input(temp_file_1)
			g2_intx = gff.input(temp_file_2)
			matching_count = unmatching_count = 0
			# we cannot chain equal signs here, because the two would reference the
			# same list, and that would be bad...
			matching_read_depths, unmatching_read_depths = [], []
			
			for record1 in g1_intx:
				record2 = g2_intx.next()
				
				# these records should match in terms of the interval they represent
				if record2.seqname != record1.seqname or \
				  record2.start != record1.start or \
				  record2.end != record1.end:
					raise ValueError("files must be pre-sorted")
				
				# isolate the read depth info if we need to
				if option.read_depth:
					rd = []
					try:
						rd.append(int(record1.attributes["read_depth"].strip("\"")))
					except KeyError:
						pass
					try:
						rd.append(int(record2.attributes["read_depth"].strip("\"")))
					except KeyError:
						pass
				
				# now test if there's concordance
				try:
					if sorted(record2.attributes["alleles"].strip("\"").split("/")) != \
					  sorted(record1.attributes["alleles"].strip("\"").split("/")):
						unmatching_count += 1
						if option.enumerate:
							record1.attributes["concordant"] = "false"
							record2.attributes["concordant"] = "false"
							print record1
							print record2
						if option.read_depth:
							unmatching_read_depths.extend(rd)
					else:
						matching_count += 1
						if option.enumerate:
							record1.attributes["concordant"] = "true"
							record2.attributes["concordant"] = "true"
							print record1
							print record2
						if option.read_depth:
							matching_read_depths.extend(rd)
				# no alleles? not a SNP
				except KeyError:
					continue
			
			# now we print the result, being mindful of possible zero division problems, etc.
			if option.enumerate:
				pass
			elif option.read_depth:
				try:
					a = "%.1f" % mean(matching_read_depths)
					b = "%.1f" % median(matching_read_depths)
				except TypeError:
					a = "--"
					b = "--"
				try:
					c = "%.1f" % mean(unmatching_read_depths)
					d = "%.1f" % median(unmatching_read_depths)
				except TypeError:
					c = "--"
					d = "--"
				print ("%s %s : %s %s" % (a, b, c, d)).ljust(col_width),
			else:
				try:
					p = "%.1f%%" % (float(matching_count) / (matching_count + unmatching_count) * 100)
				except ZeroDivisionError:
					p = "--"
				if option.verbose:
					total_count = unmatching_count + matching_count
					print ("%s %s/%s" % (p, matching_count, total_count)).ljust(col_width),
				else:
					print p.ljust(col_width),
			
			# now we rewind, delete everything, and start again!
			temp_file_1.seek(0)
			temp_file_1.truncate()
			temp_file_2.seek(0)
			temp_file_2.truncate()
		
		# wrap up the line
		print ""
	
	# print the legend describing what the column and row headings mean
	if not option.enumerate:
		print "-" * 8
		file_number = 0
		for i in gff_files_1:
			file_number += 1
			print ("[%s]" % excel_column(file_number)).ljust(8),
			print i
		file_number = 0
		for i in gff_files_2:
			file_number += 1
			print ("[%s]" % file_number).ljust(8),
			print i
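# mean(), median() and excel_column() are imported from elsewhere in the
# project. Plausible reconstructions consistent with how they are used above:
# mean() and median() must return None for empty input (so that the "%.1f"
# formatting raises TypeError), and excel_column() must produce
# spreadsheet-style column names. These are sketches, not the originals.
def mean(values):
	if not values:
		return None
	return float(sum(values)) / len(values)

def median(values):
	if not values:
		return None
	ordered = sorted(values)
	midpoint = len(ordered) // 2
	if len(ordered) % 2:
		return float(ordered[midpoint])
	return (ordered[midpoint - 1] + ordered[midpoint]) / 2.0

def excel_column(number):
	# 1 -> "A", 26 -> "Z", 27 -> "AA", ...
	name = ""
	while number > 0:
		number, remainder = divmod(number - 1, 26)
		name = chr(ord("A") + remainder) + name
	return name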
def main():
	# parse options
	option, args = doc_optparse.parse(__doc__)
	
	if option.stderr:
		sysin = sys.stdin.fileno()
		sysout = sys.stdout.fileno()
		syserr = sys.stderr.fileno()
		newout = file(option.stderr,'a+',0)
		sys.stderr.flush()
		sys.stdout.flush()
		os.close(sysin)
		os.close(sysout)
		os.dup2(newout.fileno(), sysout)
		os.close(sys.stderr.fileno())
		os.dup2(newout.fileno(), syserr)

	if option.pidfile:
		file(option.pidfile,'w+').write("%d\n" % os.getpid())

	# deal with the trackback option
	if option.trackback:
		if len(args) < 4:
			doc_optparse.exit()
		url = args[0]
		path = args[1]
		kind = args[2]
		request_token = args[3]
		params = urllib.urlencode({ 'path': path, 'kind': kind, 'request_token': request_token })            
		trackback(url, params)
		return
	
	# otherwise, figure out the host and port
	host = option.host or "localhost"
	port = int(option.port or 8080)
	
	# create server
	server = xrs((host, port))
	server.register_introspection_functions()
	
	def submit(genotype_file, coverage_file='', username=None, password=None):
		# get genotype file
		r = urllib2.Request(genotype_file)
		if username is not None:
			h = "Basic %s" % base64.encodestring('%s:%s' % (username, password)).strip()
			r.add_header("Authorization", h)
		handle = urllib2.urlopen(r)
		
		# write it to a temporary location while calculating its hash
		s = hashlib.sha1()
		output_handle, output_path = mkstemp()
		for line in handle:
			os.write(output_handle, line)
			s.update(line)
		os.close(output_handle)
		
		# now figure out where to store the file permanently
		permanent_dir = os.path.join(UPLOAD_DIR, s.hexdigest())
		permanent_file = os.path.join(permanent_dir, "genotype.gff")
		if not os.path.exists(permanent_dir):
			os.makedirs(permanent_dir)
			shutil.copy(output_path, permanent_file)
		
		# run the query
		submit_local(permanent_file)
		return s.hexdigest()  # a raw sha1 object would not marshal over XML-RPC
	server.register_function(submit)
	
	def submit_local(genotype_file, coverage_file='', trackback_url='', request_token='', reprocess_all=False):
		# create output dir
		input_dir = os.path.dirname(genotype_file)
		output_dir = input_dir + "-out"
		try:
			if not os.path.exists(output_dir):
				os.makedirs(output_dir)
		except OSError:
			print "Unexpected error:", sys.exc_info()[0]

		# cache phenotype/profile data locally if it is a special symlink
		if (os.path.islink(os.path.join(input_dir,"phenotype"))
		    and
		    re.match('warehouse://.*', os.readlink(os.path.join(input_dir,"phenotype")))):
			cmd = '''(
			set -e
			cd '%s'
			whget phenotype phenotype.$$
			mv phenotype phenotype-locator
			mv --no-target-directory phenotype.$$ phenotype
			) &''' % os.path.dirname(genotype_file)
			subprocess.call (cmd, shell=True)

		# fetch from warehouse if genotype file is special symlink
		fetch_command = "cat"
		if os.path.islink(genotype_file):
			if re.match('warehouse://.*', os.readlink(genotype_file)):
				fetch_command = "whget"

		# letters refer to scripts; numbers refer to outputs
		args = { 'reprocess_all': reprocess_all,
			 'A': os.path.join(script_dir, "gff_twobit_query.py"),
		         'B': os.path.join(script_dir, "gff_dbsnp_query.py"),
		         'C': os.path.join(script_dir, "gff_nonsynonymous_filter.py"),
		         'Z': os.path.join(script_dir, "trait-o-matic-server.py"),
		         'in': genotype_file,
			 'fetch': fetch_command,
		         'reference': REFERENCE_GENOME,
		         'url': trackback_url,
		         'token': request_token,
		         '1': os.path.join(output_dir, "genotype.gff"),
		         '2': os.path.join(output_dir, "genotype.dbsnp.gff"),
		         'ns_gff': os.path.join(output_dir, "ns.gff"),
			 'dbsnp_filters': "snpedia hugenetgwas",
			 'ns_filters': "omim hgmd morbid pharmgkb get-evidence",
			 'script_dir': script_dir,
			 'output_dir': output_dir,
			 'lockfile': os.path.join(output_dir, "lock"),
			 'logfile': os.path.join(output_dir, "log")}

		cmd = '''(
		flock --nonblock --exclusive 2 || exit
		set -x
		set -e
		cd '%(output_dir)s' || exit
		if [ ! -e '%(ns_gff)s' -o ! -e '%(1)s' -o '%(reprocess_all)s' != False ]
		then
			%(fetch)s '%(in)s' | gzip -cdf | python '%(A)s' '%(reference)s' /dev/stdin | egrep 'ref_allele [ACGTN]' > '%(1)s'.tmp
			mv '%(1)s'.tmp '%(1)s'

			python '%(B)s' '%(1)s' > '%(2)s'.tmp
			mv '%(2)s'.tmp '%(2)s'

			python '%(C)s' '%(2)s' '%(reference)s' > '%(ns_gff)s'.tmp
			mv '%(ns_gff)s'.tmp '%(ns_gff)s'
		fi
		python '%(script_dir)s'/gff2json.py '%(ns_gff)s' > ns.json.tmp
		mv ns.json.tmp ns.json
		python '%(script_dir)s'/json_allele_frequency_query.py ns.json --in-place

		jsons=""

		for filter in %(dbsnp_filters)s
		do
			python '%(script_dir)s'/gff_${filter}_map.py '%(2)s' > ${filter}.json.tmp
			mv ${filter}.json.tmp ${filter}.json
			python '%(script_dir)s'/json_allele_frequency_query.py "$filter.json" --in-place
			jsons="$jsons %(output_dir)s/${filter}.json"
		done

		for filter in %(ns_filters)s
		do
			python '%(script_dir)s'/gff_${filter}_map.py '%(ns_gff)s' > "$filter.json.tmp"
			mv "$filter.json.tmp" "$filter.json"
			python '%(script_dir)s'/json_allele_frequency_query.py "$filter.json" --in-place
			jsons="$jsons %(output_dir)s/${filter}.json"
		done
		python '%(script_dir)s'/json_to_job_database.py --drop-tables $jsons '%(output_dir)s'/ns.json
		touch README
		for filter in %(ns_filters)s %(dbsnp_filters)s ns
		do
			python '%(Z)s' -t '%(url)s' '%(output_dir)s'/$filter.json out/$filter '%(token)s'
		done
		python '%(Z)s' -t '%(url)s' '%(output_dir)s'/README out/readme '%(token)s'
		mv %(lockfile)s %(logfile)s
		) 2>>%(lockfile)s &''' % args
		subprocess.call(cmd, shell=True)
		return output_dir
	server.register_function(submit_local)

	def get_progress(genotype_file):
		output_dir = os.path.dirname(genotype_file) + "-out"
		lockfile = os.path.join(output_dir,'lock')
		logfile = os.path.join(output_dir,'log')
		# remove the lockfile if it is stale
		subprocess.call('flock --nonblock --exclusive %(lock)s mv %(lock)s %(log)s 2>/dev/null || true'
				% { "lock": lockfile,
				    "log": logfile }, shell=True)
		if os.path.exists(lockfile):
			return { "state": "processing" }
		else:
			return { "state": "finished" }
	server.register_function(get_progress)

	def copy_to_warehouse(genotype_file, coverage_file, phenotype_file, trackback_url='', request_token='', recopy=True, tag=False):
		output_dir = os.path.dirname(genotype_file)

		g_locator = _copy_file_to_warehouse (genotype_file, "genotype.gff", tag, "genotype")
		c_locator = _copy_file_to_warehouse (coverage_file, "coverage", tag, "coverage")
		p_locator = _copy_file_to_warehouse (phenotype_file, "profile.json", tag, "profile")
		if (g_locator is not None and
		    c_locator is not None and
		    p_locator is not None):
			return (g_locator, c_locator, p_locator)
		return None
	server.register_function(copy_to_warehouse)

	def _copy_file_to_warehouse (source_file, target_filename=None, tag=False, data_type=None, trackback_url=None, request_token='', recopy=True):
		if not source_file:
			return ''

		# if file is special symlink, return link target
		if os.path.islink(source_file):
			if re.match('warehouse://.*', os.readlink(source_file)):
				locator = os.readlink(source_file)
				_update_warehouse_name_list (locator, target_filename, tag, data_type)
				return locator

		# if file has already been copied to warehouse, do not recopy
		if not recopy and os.path.islink(source_file + '-locator'):
			locator = os.readlink(source_file + '-locator')
			_update_warehouse_name_list (locator, target_filename, tag, data_type)
			return locator

		# if copying is required, fork a child process and return now
		if os.fork() > 0:
			# wait for intermediate proc to fork & exit
			os.wait()
			# return existing locator if available
			if os.path.islink(source_file + '-locator'):
				return os.readlink(source_file + '-locator')
			return ''

		# double-fork avoids accumulating zombie child processes
		if os.fork() > 0:
			os._exit(0)

		if not target_filename:
			target_filename = os.path.basename (source_file)
		whput = subprocess.Popen(["whput",
					  "--in-manifest",
					  "--use-filename=%s" % target_filename,
					  source_file],
					 stdout=subprocess.PIPE)
		(locator, errors) = whput.communicate()
		ret = whput.returncode
		if ret is None:
			ret = whput.wait()
		if ret == 0:
			locator = 'warehouse:///' + locator.strip() + '/' + target_filename
			try:
				os.symlink(locator, source_file + '-locator.tmp')
				os.rename(source_file + '-locator.tmp', source_file + '-locator')
				_update_warehouse_name_list (locator, target_filename, tag, data_type)
			except OSError:
				print >> sys.stderr, 'Ignoring error creating symlink ' + source_file + '-locator'
			if trackback_url:
				subprocess.call("python '%(Z)s' -t '%(url)s' '%(out)s' '%(source)s' '%(token)s'"
						% { 'Z': os.path.join (script_dir, "trait-o-matic-server.py"),
						    'url': trackback_url,
						    'out': locator,
						    'source': source_file,
						    'token': request_token })
			os._exit(0)
		os._exit(1)
	
	def _update_warehouse_name_list (locator, target_filename, tag, data_type):
		if tag:
			share_name = "/" + os.uname()[1] + "/Trait-o-matic/" + tag + "/" + data_type
			share_target = re.sub("warehouse:///", "", locator)
			old_target = warehouse.name_lookup (share_name)
			whargs = ["wh",
				  "manifest",
				  "name",
				  "name=" + share_name,
				  "newkey=" + share_target]
			if old_target:
				whargs.append ("oldkey=" + old_target)
			subprocess.call (whargs)

	# run the server's main loop
	server.serve_forever()
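# A minimal sketch of calling this XML-RPC server from a client using the
# standard library's xmlrpclib; the URL below is an example only:
import xmlrpclib

def submit_genotype(genotype_url, host="localhost", port=8080):
	proxy = xmlrpclib.ServerProxy("http://%s:%d/" % (host, port))
	# submit() fetches the file server-side and returns its sha1 digest;
	# get_progress() and the other registered functions are called the same way
	return proxy.submit(genotype_url)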
def main():
    # parse options
    option, args = doc_optparse.parse(__doc__)

    # deal with the trackback option
    if option.trackback:
        if len(args) < 4:
            doc_optparse.exit()
        url = args[0]
        path = args[1]
        kind = args[2]
        request_token = args[3]
        params = urllib.urlencode({"path": path, "kind": kind, "request_token": request_token})
        trackback(url, params)
        return

    # otherwise, figure out the host and port
    host = option.host or "localhost"
    port = int(option.port or 8080)

    # create server
    server = xrs((host, port))
    server.register_introspection_functions()

    def submit(genotype_file, coverage_file="", username=None, password=None):
        # get genotype file
        r = urllib2.Request(genotype_file)
        if username is not None:
            h = "Basic %s" % base64.encodestring("%s:%s" % (username, password)).strip()
            r.add_header("Authorization", h)
        handle = urllib2.urlopen(r)

        # write it to a temporary location while calculating its hash
        s = hashlib.sha1()
        output_handle, output_path = mkstemp()
        for line in handle:
            os.write(output_handle, line)
            s.update(line)
        os.close(output_handle)

        # now figure out where to store the file permanently
        permanent_dir = os.path.join(UPLOAD_DIR, s.hexdigest())
        permanent_file = os.path.join(permanent_dir, "genotype.gff")
        if not os.path.exists(permanent_dir):
            os.makedirs(permanent_dir)
            shutil.copy(output_path, permanent_file)

        # run the query
        submit_local(permanent_file)
        return s.hexdigest()  # a raw sha1 object would not marshal over XML-RPC

    server.register_function(submit)

    def submit_local(genotype_file, coverage_file="", trackback_url="", request_token=""):
        # execute script
        script_dir = os.path.dirname(sys.argv[0])

        # create output dir
        output_dir = os.path.dirname(genotype_file) + "-out"
        try:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
        except OSError:
            print "Unexpected error:", sys.exc_info()[0]

        # fetch from warehouse if genotype file is special symlink
        fetch_command = "cat"
        if os.path.islink(genotype_file):
            if re.match("warehouse://.*", os.readlink(genotype_file)):
                fetch_command = "whget"

        # letters refer to scripts; numbers refer to outputs
        args = {
            "A": os.path.join(script_dir, "gff_twobit_query.py"),
            "B": os.path.join(script_dir, "gff_dbsnp_query.py"),
            "C": os.path.join(script_dir, "gff_nonsynonymous_filter.py"),
            "D": os.path.join(script_dir, "gff_omim_map.py"),
            "E": os.path.join(script_dir, "gff_hgmd_map.py"),
            "F": os.path.join(script_dir, "gff_morbid_map.py"),
            "G": os.path.join(script_dir, "gff_snpedia_map.py"),
            "pharmgkb_bin": os.path.join(script_dir, "gff_pharmgkb_map.py"),
            "H": os.path.join(script_dir, "json_allele_frequency_query.py"),
            "I": os.path.join(script_dir, "json_to_job_database.py"),
            "Z": os.path.join(script_dir, "server.py"),
            "in": genotype_file,
            "fetch": fetch_command,
            "reference": REFERENCE_GENOME,
            "url": trackback_url,
            "token": request_token,
            "1": os.path.join(output_dir, "genotype.gff"),
            "2": os.path.join(output_dir, "genotype.dbsnp.gff"),
            "3": os.path.join(output_dir, "ns.gff"),
            "4": os.path.join(output_dir, "omim.json"),
            "5": os.path.join(output_dir, "hgmd.json"),
            "6": os.path.join(output_dir, "morbid.json"),
            "7": os.path.join(output_dir, "snpedia.json"),
            "pharmgkb_out": os.path.join(output_dir, "pharmgkb.json"),
            "8": "",
            "0": os.path.join(output_dir, "README"),
        }
        cmd = (
            """(
		%(fetch)s '%(in)s' | python '%(A)s' '%(reference)s' /dev/stdin > '%(1)s'
		python '%(B)s' '%(1)s' > '%(2)s'
		python '%(C)s' '%(2)s' '%(reference)s' > '%(3)s'
		python '%(D)s' '%(3)s' > '%(4)s'
		python '%(E)s' '%(3)s' > '%(5)s'
		python '%(F)s' '%(3)s' > '%(6)s'
		python '%(G)s' '%(2)s' > '%(7)s'
		python '%(pharmgkb_bin)s' '%(3)s' > '%(pharmgkb_out)s'
		python '%(H)s' '%(4)s' '%(5)s' '%(6)s' '%(7)s' '%(pharmgkb_out)s' --in-place
		python '%(I)s' --drop-tables '%(4)s' '%(5)s' '%(6)s' '%(7)s' '%(pharmgkb_out)s'
		touch '%(0)s'
		python '%(Z)s' -t '%(url)s' '%(4)s' 'out/omim' '%(token)s'
		python '%(Z)s' -t '%(url)s' '%(5)s' 'out/hgmd' '%(token)s'
		python '%(Z)s' -t '%(url)s' '%(6)s' 'out/morbid' '%(token)s'
		python '%(Z)s' -t '%(url)s' '%(7)s' 'out/snpedia' '%(token)s'
		python '%(Z)s' -t '%(url)s' '%(pharmgkb_out)s' 'out/pharmgkb' '%(token)s'
		python '%(Z)s' -t '%(url)s' '%(0)s' 'out/readme' '%(token)s'
		)&"""
            % args
        )
        subprocess.call(cmd, shell=True)
        return output_dir

    server.register_function(submit_local)

    def copy_to_warehouse(
        genotype_file, coverage_file, phenotype_file, trackback_url="", request_token="", recopy=True
    ):
        # execute script
        script_dir = os.path.dirname(sys.argv[0])
        output_dir = os.path.dirname(genotype_file)

        g_locator = _copy_file_to_warehouse(genotype_file, "genotype.gff")
        c_locator = _copy_file_to_warehouse(coverage_file, "coverage")
        p_locator = _copy_file_to_warehouse(phenotype_file, "phenotype.json")
        if g_locator is not None and c_locator is not None and p_locator is not None:
            return (g_locator, c_locator, p_locator)
        return None

    server.register_function(copy_to_warehouse)

    def _copy_file_to_warehouse(source_file, target_filename=None, trackback_url=None, request_token="", recopy=True):
        # script_dir and request_token are used below but were not in scope here
        script_dir = os.path.dirname(sys.argv[0])
        if not source_file:
            return ""

        # if file is special symlink, return link target
        if os.path.islink(source_file):
            if re.match("warehouse://.*", os.readlink(source_file)):
                return os.readlink(source_file)

        # if file has already been copied to warehouse, do not recopy
        if not recopy and os.path.islink(source_file + "-locator"):
            return os.readlink(source_file + "-locator")

        # if copying is required, fork a child process and return now
        if os.fork() > 0:
            # wait for intermediate proc to fork & exit
            os.wait()
            # return existing locator if available
            if os.path.islink(source_file + "-locator"):
                return os.readlink(source_file + "-locator")
            return ""

        # double-fork avoids accumulating zombie child processes
        if os.fork() > 0:
            os._exit(0)

        if not target_filename:
            target_filename = os.path.basename(source_file)
        whput = subprocess.Popen(
            ["whput", "--in-manifest", "--use-filename=%s" % target_filename, source_file], stdout=subprocess.PIPE
        )
        (locator, errors) = whput.communicate()
        ret = whput.returncode
        if ret is None:
            ret = whput.wait()
        if ret == 0:
            locator = "warehouse:///" + locator.strip() + "/" + target_filename
            try:
                os.symlink(locator, source_file + "-locator.tmp")
                os.rename(source_file + "-locator.tmp", source_file + "-locator")
            except OSError:
                print >> sys.stderr, "Ignoring error creating symlink " + source_file + "-locator"
            if trackback_url:
                subprocess.call(
                    "python '%(Z)s' -t '%(url)s' '%(out)s' '%(source)s' '%(token)s'"
                    % {
                        "Z": os.path.join(script_dir, "server.py"),
                        "url": trackback_url,
                        "out": locator,
                        "source": source_file,
                        "token": request_token,
                    },
                    shell=True,
                )
            os._exit(0)
        os._exit(1)

    # run the server's main loop

    server.serve_forever()
def main():
	# parse options
	option, args = doc_optparse.parse(__doc__)
	
	# deal with the trackback option
	if option.trackback:
		if len(args) < 4:
			doc_optparse.exit()
		url = args[0]
		path = args[1]
		kind = args[2]
		request_token = args[3]
		params = urllib.urlencode({ 'path': path, 'kind': kind, 'request_token': request_token })            
		trackback(url, params)
		return
	
	# otherwise, figure out the host and port
	host = option.host or "localhost"
	port = int(option.port or 8080)
	
	# create server
	server = xrs((host, port))
	server.register_introspection_functions()
	
	def submit(genotype_file, coverage_file='', username=None, password=None):
		# get genotype file
		r = urllib2.Request(genotype_file)
		if username is not None:
			h = "Basic %s" % base64.encodestring('%s:%s' % (username, password)).strip()
			r.add_header("Authorization", h)
		handle = urllib2.urlopen(r)
		
		# write it to a temporary location while calculating its hash
		s = hashlib.sha1()
		output_handle, output_path = mkstemp()
		for line in handle:
			os.write(output_handle, line)
			s.update(line)
		os.close(output_handle)
		
		# now figure out where to store the file permanently
		permanent_dir = os.path.join(UPLOAD_DIR, s.hexdigest())
		permanent_file = os.path.join(permanent_dir, "genotype.gff")
		if not os.path.exists(permanent_dir):
			os.makedirs(permanent_dir)
			shutil.move(output_path, permanent_file)
		
		# run the query
		submit_local(permanent_file)
		return s.hexdigest()  # a raw sha1 object would not marshal over XML-RPC
	server.register_function(submit)
	
	def submit_local(genotype_file, coverage_file='', trackback_url='', request_token=''):
		# execute script
		script_dir = os.path.dirname(sys.argv[0])
		output_dir = os.path.dirname(genotype_file)
		# letters refer to scripts; numbers refer to outputs
		args = { 'A': os.path.join(script_dir, "gff_twobit_query.py"),
		         'B': os.path.join(script_dir, "gff_dbsnp_query.py"),
		         'C': os.path.join(script_dir, "gff_nonsynonymous_filter.py"),
		         'D': os.path.join(script_dir, "gff_omim_map.py"),
		         'E': os.path.join(script_dir, "gff_hgmd_map.py"),
		         'F': os.path.join(script_dir, "gff_morbid_map.py"),
		         'G': os.path.join(script_dir, "gff_snpedia_map.py"),
		         'H': os.path.join(script_dir, "json_allele_frequency_query.py"),
		         'I': os.path.join(script_dir, "json_to_job_database.py"),
		         'Z': os.path.join(script_dir, "server.py"),
		         'in': genotype_file,
		         'reference': REFERENCE_GENOME,
		         'url': trackback_url,
		         'token': request_token,
		         '1': os.path.join(output_dir, "genotype.gff"),
		         '2': os.path.join(output_dir, "genotype.dbsnp.gff"),
		         '3': os.path.join(output_dir, "ns.gff"),
		         '4': os.path.join(output_dir, "omim.json"),
		         '5': os.path.join(output_dir, "hgmd.json"),
		         '6': os.path.join(output_dir, "morbid.json"),
		         '7': os.path.join(output_dir, "snpedia.json"),
		         '8': "",
		         '0': os.path.join(output_dir, "README") }
		cmd = '''(
		python '%(A)s' '%(in)s' '%(reference)s' > '%(1)s'
		python '%(B)s' '%(1)s' > '%(2)s'
		python '%(C)s' '%(2)s' '%(reference)s' > '%(3)s'
		python '%(D)s' '%(3)s' > '%(4)s'
		python '%(E)s' '%(3)s' > '%(5)s'
		python '%(F)s' '%(3)s' > '%(6)s'
		python '%(G)s' '%(2)s' > '%(7)s'
		python '%(H)s' '%(4)s' '%(5)s' '%(6)s' '%(7)s' --in-place
		python '%(I)s' '%(4)s' '%(5)s' '%(6)s' '%(7)s'
		touch '%(0)s'
		python '%(Z)s' -t '%(url)s' '%(4)s' 'out/omim' '%(token)s'
		python '%(Z)s' -t '%(url)s' '%(5)s' 'out/hgmd' '%(token)s'
		python '%(Z)s' -t '%(url)s' '%(6)s' 'out/morbid' '%(token)s'
		python '%(Z)s' -t '%(url)s' '%(7)s' 'out/snpedia' '%(token)s'
		python '%(Z)s' -t '%(url)s' '%(0)s' 'out/readme' '%(token)s'
		)&''' % args
		subprocess.call(cmd, shell=True)
		return output_dir
	server.register_function(submit_local)
	
	# run the server's main loop
	server.serve_forever()
def main():
	# parse options
	option, args = doc_optparse.parse(__doc__)
	
	if len(args) < 2:
		doc_optparse.exit()
	
	flank = int(option.flank or 0)
	
	# try opening the file both ways, in case the arguments got confused
	try:
		gff_file = gff.input(args[1])
		twobit_file = twobit.input(args[0])
	except Exception:
		gff_file = gff.input(args[0])
		twobit_file = twobit.input(args[1])
	
	# initialize a set of variables to keep track of uniqueness, if we need them
	if option.unique:
		previous_record = None
		previous_ref_seq = None
		repetition_count = 1
	
	for record in gff_file:
		# if we're using the unique option, output the previous record only when
		# we're sure we've seen all repetitions of it
		if option.unique and record == previous_record:
			repetition_count += 1
			continue
		elif option.unique:
			if previous_record:
				previous_record.attributes["repetition_count"] = str(repetition_count)
				print FastaRecord(str(previous_record).replace("\t", "|"), previous_ref_seq)
			repetition_count = 1
			previous_record = record

		if record.seqname.startswith("chr"):
			chr = record.seqname
		else:
			chr = "chr" + record.seqname
		
		ref_seq = twobit_file[chr][(record.start - 1):record.end]

		if flank != 0:
			# calculate the flanks (these variables are 0-based)
			left_flank_start = record.start - flank - 1
			left_flank_end = record.start - 1
			if left_flank_start < 0:
				left_flank_start = 0
			
			right_flank_start = record.end
			right_flank_end = record.end + flank
			
			# now find them
			left_flank_seq = twobit_file[chr][left_flank_start:left_flank_end]
			right_flank_seq = twobit_file[chr][right_flank_start:right_flank_end]
			ref_seq = left_flank_seq + "\n\n" + ref_seq + "\n\n" + right_flank_seq
		
		if option.strand and record.strand == "-":
			ref_seq = reverse_complement(ref_seq)
		
		# we don't output the current record if we're using the unique option
		if option.unique:
			previous_ref_seq = ref_seq
		else:
			print FastaRecord(str(record).replace("\t", "|"), ref_seq)
	
	# we'll have one last record yet to output if we used the unique option
	if option.unique:
		previous_record.attributes["repetition_count"] = str(repetition_count)
		print FastaRecord(str(previous_record).replace("\t", "|"), previous_ref_seq)
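# reverse_complement() and FastaRecord come from elsewhere in the project. A
# standard reverse_complement consistent with its use above (a reconstruction,
# not the original):
import string

_COMPLEMENT = string.maketrans("ACGTacgtNn", "TGCAtgcaNn")

def reverse_complement(seq):
	# complement each base, then reverse the sequence
	return seq.translate(_COMPLEMENT)[::-1]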