def test_anhadir_id_lista():
    lista_test = ["id_1", "id_1", "id_2", "id_3"]
    with pytest.raises(SystemExit) as pytest_error:
        Parse.anhadir_id_lista("id_1", lista_test)
    assert pytest_error.type == SystemExit
    Parse.anhadir_id_lista("id_4", lista_test)
    assert lista_test[-1] == "id_4"

class main:
    try:
        if len(sys.argv) < 2:
            url = raw_input("Please enter URL: ")
            if ' ' in url:
                # raise ValueError so the except clause below actually catches it
                raise ValueError("InputError")
            parseObj = Parse(url)
        else:
            url = sys.argv[1]
            parseObj = Parse(url)
    except ValueError as e:
        print(e)
        sys.exit()
    title = parseObj.getTitle()
    keyword = parseObj.getKeyword()
    content = parseObj.getParsedContent()
    content = content[-20:]
    header = parseObj.getParsedHeader(title, keyword)
    header = header[-10:]
    analyzer = mergeBag(header, content, 3)
    analyzer = sortBag(analyzer)
    analyzer = list(analyzer[-8:])
    analyzer.reverse()
    print("\nWebpage: " + url)
    print('\nKeywords:')
    for w in analyzer:
        print(w[0])
    print('')

def translate_directory(filename, root, output_file):
    input_file = open(root + "/" + filename, "r")
    for ln in input_file:
        line = ln.strip()
        if not Parse.ignore_line(line):  # check if we ignore line or not
            translated = Parse.parse_line(line, filename)
            output_file.write(translated)
    input_file.close()

def get_result(self, text):
    sents, res = [], []
    parsed_text = utils.call_stanfardnlp_parse(text)
    parsed_sentences = Parse.get_parsed_text(parsed_text.split('\n'))
    for parsed_sentence in parsed_sentences:
        root = Parse.encode_tree(parsed_sentence)
        sents.append(root.to_strings())
        temp = self.traverse_check(root)
        res.append(temp)
    return sents, res

def translate_vm_file(file):
    input_file = open(file, "r")
    file_name = path.basename(input_file.name)[:FILE_NAME_LAST_INDEX]
    output_file = open(file[:FILE_NAME_LAST_INDEX] + ".asm", "w")
    for ln in input_file:
        line = ln.strip()
        if not Parse.ignore_line(line):  # check if we ignore line or not
            translated = Parse.parse_line(line, file_name)
            output_file.write(translated)
    input_file.close()
    output_file.close()

def update_news(bot, job):
    """
    Update the news in the DB
    :param bot:
    :param job:
    :return: void
    """
    topics = Parse.parse_topics(URL)
    for t in topics:
        Parse.parse_one_doc_to_set_topic_time(t)
    topics = DB.remain_need_to_update_topics(topics)
    topics = Parse.parse_docs(topics)
    DB.update_DB(topics)

def test_comprobar_existencia_dependencias():
    lista_id = ["foo", "id_1", "prueba"]
    tareaBien = Tarea.Tarea("root", "raiz", ["foo", "id_1"],
                            datetime.datetime(2021, 1, 1), 30, ["foo", "id_1"])
    tareaMal = Tarea.Tarea("root", "raiz", ["error", "nop"],
                           datetime.datetime(2021, 1, 1), 30, ["error", "nop"])
    try:
        Parse.comprobar_existencia_dependencias(tareaBien, lista_id)
    except SystemExit:
        pytest.fail()
    with pytest.raises(SystemExit) as pytest_error:
        Parse.comprobar_existencia_dependencias(tareaMal, lista_id)
    assert pytest_error.type == SystemExit

def transfer_from_online_to_db():
    dl_count = 0
    files = get_files_to_dl()
    dl_start = datetime.datetime.now()
    for file in files:
        url, index = Parse.get_info_from_file_name(file)
        dl_file(url)
        data = Parse.get_data_from_txt_file(index)
        MySQL.insert_data_for_day(data)
        os.remove("data.txt")
        dl_count += 1
        print(f"downloaded {dl_count} at {(datetime.datetime.now() - dl_start) / dl_count} per dl")

def do_parse(infile):
    """ this is the main parsing module """
    preprocess(infile)  # okay
    (sen, doc) = extract_features()
    java_classpath = 'Tools/grmm/class:Tools/grmm/lib/mallet-deps.jar:Tools/grmm/lib/grmm-deps.jar'
    java_prog = 'edu.umass.cs.mallet.grmm.learning.AcrfForTestJoty'
    if sen == "yes":
        apply_sent_model("tmp_sen.feat", java_classpath, java_prog)
    if doc == "yes":
        apply_doc_model("tmp_doc.feat", java_classpath, java_prog)
    Parse.parse('parse_sen.rel', 'tmp_sen.prob', "tmp_sen.feat", "tmp_sen.dis",
                'parse_doc.rel', 'tmp_doc.prob', "tmp_doc.feat", "tmp_doc.dis",
                "tmp.edu")

def translate_directory(filename, root, output_file):
    """
    translate all vm files in a directory to one asm file.
    :param filename: name of file
    :param root: path
    :param output_file: translated vm to asm file
    :return: None
    """
    input_file = open(root + os.sep + filename, "r")
    for ln in input_file:
        line = ln.strip()
        if not Parse.ignore_line(line):  # check if we ignore line or not
            translated = Parse.parse_line(line, filename)
            output_file.write(translated)
    input_file.close()

def test_cast_duracion():
    assert Parse.cast_duracion("20") == 20
    assert Parse.cast_duracion("20d") == 20
    assert Parse.cast_duracion("20w") == 140
    assert Parse.cast_duracion("20m") == 600
    assert Parse.cast_duracion("2y") == 730
    with pytest.raises(SystemExit) as pytest_error:
        Parse.cast_duracion("fooy")
    assert pytest_error.type == SystemExit
    with pytest.raises(SystemExit) as pytest_error:
        Parse.cast_duracion("y123")
    assert pytest_error.type == SystemExit

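# A minimal sketch of the duration parser these tests imply; hypothetical, since
# Parse.cast_duracion itself is not shown. Assumes the suffixes d/w/m/y map to
# 1/7/30/365 days and that invalid input exits, as the assertions require.
import sys

def cast_duracion_sketch(s):
    units = {"d": 1, "w": 7, "m": 30, "y": 365}  # days per unit suffix
    factor = units.get(s[-1], None)
    digits = s[:-1] if factor else s
    if factor is None:
        factor = 1  # bare number: already in days
    if not digits.isdigit():
        sys.exit("invalid duration: " + s)  # raises SystemExit, as the tests expect
    return int(digits) * factor
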
def main():
    print(colored("\n#####################################################", 'green'))
    print(colored("   Welcome to Team-15 DataBase Management System   ", 'green'))
    print(colored("#####################################################\n", 'green'))
    userLoginSignUp()
    query = ""
    database = ""
    queryProcessor = qp.QueryProcessor()
    while not query.lower() == "quit":
        query = input(constants.InputQuery)
        if "use" in query.lower():
            Parse.Parse.newDB = True
            db_raw = re.compile(r'use\s(.*)\s*', re.IGNORECASE).findall(query)
            database = db_raw[0]
            query = input()
        query_type = Parse.Parse(database, query, queryProcessor)
        val = query_type.check_query()
        print("\n#####################################################")
        if val == -1:
            print(colored("Incorrect Query", 'red'))
        elif val == 0:
            break
    print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Thanks!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

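# Example of the `use` extraction above:
#   re.compile(r'use\s(.*)\s*', re.IGNORECASE).findall('use SalesDB')
# returns ['SalesDB'], so `database` becomes 'SalesDB' and the next input()
# is read as the query to run against that database.
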
def __init__(self):
    self.__stroage = Storeage.Stroage()
    self.__chrome_options = webdriver.ChromeOptions()
    # self.__chrome_options.add_argument('--proxy-server=%s' % PROXY)
    self.__chrome_options.add_experimental_option(
        "prefs", {'profile.default_content_setting_values.images': 2})
    self.__parse = Parse.Parse()

def main(filename):
    global tokentuple
    Scan.init(filename)
    while True:
        raw_input("Press Enter")
        print
        (textline, indent, tokens) = Scan.readLine()
        tree, rest = Parse.parseCOMMANDLINE(tokens)
        print "tree =", tree
        print "what's left over:", rest
        prop2 = nnfOf(tree[1])
        print "- shifted inwards:"
        print prop2
        print
        prop3 = nfOf("or", "and", prop2)
        print "cnf:"
        print prop3
        print
        prop4 = flatten("or", "and", prop3)
        print "flattened cnf:"
        print prop4
        print
        prop5 = removeDuplicates(prop4)
        print "no duplicates:", prop5
        prop6 = removeOpposites(prop5)
        print "simplified cnf:"
        print prop6
        for clause in prop6:
            print clause
    Scan.quit()

def insertAppend(C6, v, e):
    """appends e to the end of array/list v in the heap.  Does the same
    actions as an insertAssign to an indexed array, but preserves more
    heap info, since the append does not produce any aliases within v.

    params: C6;  v - a vartree;  e - an etree
    """
    sigma = C6["store"]
    heap = C6["heap"]
    vname = v[1]
    vold = Parse.makeOldVar(v)
    if lookupType(C6, vname) != "array":
        error("cannot append to a non-list/array")
    else:
        loc = PE.peToTuple(sigma[vname])
        length = heap[loc][0]
        newlength = PE.add(length, PE.make(1))
        vector = heap[loc][1]
        # assign original to v_old:
        sigma[vold[1]] = sigma[vname]
        # make copy for the new value of v:
        copy = {}
        for k in vector:
            copy[k] = vector[k]
        newloc = PE.make(PE.makeSym())
        rhs = PE.evall(C6, e)
        copy[PE.peToTuple(length)] = rhs
        sigma[vname] = newloc
        heap[PE.peToTuple(newloc)] = (newlength, copy)

def goSequent():
    """goSequent helps a user interactively type a sequent to be proved,
    formats it in cnf (where the goal prop is negated), and writes it as
    a string to a text file.  The string has the format,  D1 D2 ... Dn,
    where  D ::= [F1, F2, ..., Fm]
           F ::= "p" | "-p"     where p is a string of letters

    Example: the input,  p->r, q->r |- (p | q) -> r,  is mapped to the
    cnf form,  [['-p', 'r'], ['-q', 'r'], ['p', 'q'], ['-r']],  and the
    string,  [-p,r][-q,r][p,q][-r],  is written to the output file that
    the user requests.
    """
    import Parse
    premises = True
    answer = []
    while premises:
        text = raw_input("Type premise (or |-): ").strip()
        if text == "|-":
            premises = False
        else:
            prop = cnf(Parse.parse(Parse.scan(text)))
            answer = answer + prop
    text = raw_input("Type goal prop: ")
    not_text = "-(" + text + ")"
    not_goal = cnf(Parse.parse(Parse.scan(not_text)))
    answer = answer + not_goal
    print "clauses are:", answer
    print
    filename = raw_input("Type name of destination file: ")
    output = open(filename, "w")
    textline = ""
    for clause in answer:
        textline = textline + "["
        items = ""
        for literal in clause:
            items = items + "," + literal
        if items != "":
            items = items[1:]  # forget leading comma
        textline = textline + items + "]"
    print "wrote to", filename + ":", textline
    output.write(textline)
    output.close()

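# Illustrative goSequent session (hypothetical transcript; exact cnf output
# depends on the Parse and cnf modules not shown here), using the docstring's
# own sequent:
#
#   Type premise (or |-): p->r
#   Type premise (or |-): q->r
#   Type premise (or |-): |-
#   Type goal prop: (p | q) -> r
#   clauses are: [['-p', 'r'], ['-q', 'r'], ['p', 'q'], ['-r']]
#   Type name of destination file: out.txt
#   wrote to out.txt: [-p,r][-q,r][p,q][-r]
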
def main():
    in_put = "Input.txt"
    out_put = "Output.txt"
    with open(in_put, "r") as file:
        text = file.read()  # renamed from `str` to avoid shadowing the built-in
    result = Parse.process(text)
    with open(out_put, "w") as file:
        file.write(result)

def translate_vm_file(file):
    """
    translate only one vm file to an asm file
    :param file: name of file
    :return: None
    """
    input_file = open(file, "r")
    file_name = path.basename(input_file.name)[:FILE_NAME_LAST_INDEX]
    output_file = open(file[:FILE_NAME_LAST_INDEX] + ".asm", "w")
    output_file.write(Translator.write_init())
    for ln in input_file:
        line = ln.strip()
        if not Parse.ignore_line(line):  # check if we ignore line or not
            translated = Parse.parse_line(line, file_name)
            output_file.write(translated)
    input_file.close()
    output_file.close()

def execute(self, edit, file):
    Norme.header(file.header)
    Norme.includes(file.includes)
    if len(file.functions) > 5:
        file.errors.append(Parse.Error("NBFUNCS", "", "FILE"))
    for function in file.functions:
        Norme.function(function)
    self.show_errors(file)

def search(posting_path, query, stemmer, query_source_path, list_of_language, list_of_city, semantic):
    Parse.set_stop_words_file(posting_path + "/stop_words.txt")
    list_save_queries = Parse.parse_queries(query_source_path, posting_path, query, stemmer, semantic)
    res = {}
    for query_post in list_save_queries:
        fileName = posting_path + "/" + query_post + ".pkl"
        file = open(fileName, "rb+")
        querie_term_dictionary = pickle.load(file)
        file.close()
        os.remove(fileName)
        query_name = query_post.replace('post', "")
        res[query_name] = Ranker.rank(posting_path, stemmer, querie_term_dictionary,
                                      list_of_language, list_of_city)
    return res

def translate_file(file):
    """
    Translate file to Hack binary code
    :param file: file to translate to Hack binary code
    :return: None
    """
    input_file = open(file, "r")
    # change extension to .hack
    output_file = open(file[:FILE_NAME_LAST_INDEX] + ".hack", "w")
    first_pass(input_file)  # first pass
    input_file.seek(0)
    for ln in input_file:  # second pass
        line = ln.strip()
        if not Parse.ignore_line(line) and not line.startswith("("):
            in_binary = Parse.parse_line(line)
            output_file.write(in_binary + '\n')
    input_file.close()
    output_file.close()

def matchDef(C6, btree):
    """attempts to locate a defn saved in C6["defs"] that matches the
    assertion, btree.  Returns whether or not there was success.
    """
    for scheme in C6["defs"]:
        success = Parse.match({}, btree, scheme)
        if success:
            return True
    # else, no match:
    return False

def insertAssign(C6, v, etree):
    """updates the store of C6 with an assignment.  If v already exists
    in C6's store, saves the former value as v_old for later use in
    proof reasoning.

    params: v - has form, ["var", s] or ["index", ["var", s], etree]
            etree - another etree, to be assigned to the var.
    """
    sigma = C6["store"]
    heap = C6["heap"]
    badvars = C6["novars"]
    if v[0] == "var":
        vtree = v
    elif v[0] == "index":
        vtree = v[1]
    vold = Parse.makeOldVar(vtree)  # ["var", vname_old]

    # first, check if we are allowed to update v:
    if vtree in badvars:
        error("you may not update a protected global var outside of its maintenance function")
        return

    # if possible, rename current value of var v as v_old:
    if v[0] == "var" and v[1] in sigma:  # and lookupType(C6, v[1]) != "array":
        sigma[vold[1]] = sigma[v[1]]  # assign v's current value to v_old
    elif v[0] == "index" and lookupType(C6, v[1][1]) == "array":
        vname = v[1][1]
        loc = PE.peToTuple(sigma[vname])
        length = heap[loc][0]
        vector = heap[loc][1]
        # make copy:
        copy = {}
        for k in vector:
            copy[k] = vector[k]
        # assign original to v_old and copy to v:
        sigma[vold[1]] = sigma[vname]
        newloc = PE.make(PE.makeSym())
        sigma[vname] = newloc
        heap[PE.peToTuple(newloc)] = (length, copy)
        # (later, vold will be erased from sigma....)

    # now, eval assignment's rhs and store it into v:
    rhs = PE.evall(C6, etree)
    if v[0] == "var":  # simple var
        sigma[v[1]] = rhs
    elif v[0] == "index":  # an array/list reference
        # eval index expression (NOTE: no nested indexing allowed):
        indexpe = PE.evall(C6, v[2])
        # save values in sigma[vname][1] provably distinct from vname[index]:
        vname = v[1][1]
        if vname not in sigma or lookupType(C6, vname) != "array":
            error(vname + " is not an array in the store")
            # sigma[vname] = PE.makeArray()
        else:
            vmap = heap[PE.peToTuple(sigma[vname])][1]
            saveDistinctElements(C6, vmap, indexpe)
            vmap[PE.peToTuple(indexpe)] = rhs

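# Schematic illustration of the v_old convention above (notation is informal;
# pe(.) stands for whatever symbolic value PE.evall produces): running
#   x = x + 1
# on a store where  sigma == {'x': pe(3)}  leaves
#   sigma == {'x_old': pe(3), 'x': pe(4)}
# so subsequent proof reasoning can still refer to the pre-assignment value.
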
def num_valid_valuations(formula):
    formula_tree = Parse.parse(formula)
    num_valid = 0
    for v in all_valuations(atoms(formula_tree)):
        if eval(formula_tree, v):
            num_valid += 1
    return num_valid

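# A minimal sketch of the helpers num_valid_valuations relies on; hypothetical,
# since atoms/all_valuations are defined elsewhere. Assumes formulas are nested
# tuples such as ('and', ('atom', 'a'), ('not', ('atom', 'b'))).
import itertools

def atoms_sketch(tree):
    """Collect the set of atom names appearing in a formula tree."""
    if tree[0] == 'atom':
        return {tree[1]}
    return set().union(*(atoms_sketch(sub) for sub in tree[1:]))

def all_valuations_sketch(names):
    """Yield every {atom: bool} assignment over the given atom names."""
    names = sorted(names)
    for bits in itertools.product([False, True], repeat=len(names)):
        yield dict(zip(names, bits))
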
def test_eval_complex(self):
    self.assertFalse(Eval.eval(Parse.parse("a /\\ ~(a <=> a)"), {"a": False}))
    self.assertFalse(Eval.eval(Parse.parse("a /\\ ~(a <=> a)"), {"a": True}))
    self.assertTrue(Eval.eval(Parse.parse("(Smoke => Fire) => (~Smoke => ~Fire)"), {"Smoke": False, "Fire": False}))
    self.assertFalse(Eval.eval(Parse.parse("(Smoke => Fire) => (~Smoke => ~Fire)"), {"Smoke": False, "Fire": True}))
    self.assertTrue(Eval.eval(Parse.parse("(Smoke => Fire) => (~Smoke => ~Fire)"), {"Smoke": True, "Fire": False}))
    self.assertTrue(Eval.eval(Parse.parse("(Smoke => Fire) => (~Smoke => ~Fire)"), {"Smoke": True, "Fire": True}))
    self.assertTrue(Eval.eval(Parse.parse("a /\\ (b \\/ c)"), {"a": True, "b": False, "c": True}))
    self.assertFalse(Eval.eval(Parse.parse("(a /\\ b) \\/ c"), {"a": True, "b": False, "c": False}))

def score_hand(hand, winning_tile, tsumo, ron, seat, prevalent_wind, first_turn,
               riichi, concealed, first_round, ippatsu, last_tile, d):
    """
    Symbols:
    - Coins: [cn]
    - Bamboo: [bn]
    - Characters: [kn]
    - Winds:
        - East: [we]
        - South: [ws]
        - West: [ww]
        - North: [wn]
    - Dragons:
        - Red: [dr]
        - Green: [dg]
        - White: [dw]

    Note: For suits, replace 'n' with the value of the tile.
          For red fives, replace 'n' with 'r'.
    Note: Spaces between tiles may be used to help readability.

    Example Hand: '[c1][c2][c3] [dr][dr][dr] [we][we][we] [k1][k2][k3] [br][b5]'
    Hand Contents:
    - Chow of coins 1-3
    - Pung of Red Dragons
    - Pung of East Winds
    - Chow of Characters 1-3
    - Eyes of Bamboo 5 and red 5
    """
    tiles = Parse.parse_hand(hand)
    winning = Parse.parse_winning_tile(winning_tile)
    count = Counter(tiles)
    if d:
        for k in count.keys():
            print("%s %s" % (count[k], k))
        del count
        print("Winning Tile: %s" % winning)
        print()
    Fan.calculate_fan(tiles, winning, tsumo, ron, seat, prevalent_wind, first_turn,
                      riichi, concealed, first_round, ippatsu, last_tile, d)

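# A hedged sketch of the tile tokenizer the notation above implies; the real
# Parse.parse_hand is not shown. It splits '[c1][c2]...' into two-character
# tile codes, ignoring the optional readability spaces the docstring allows.
import re

def parse_hand_sketch(hand):
    return re.findall(r"\[([a-z][a-z0-9])\]", hand)

# parse_hand_sketch('[c1][c2][c3] [dr][dr][dr]')
#   -> ['c1', 'c2', 'c3', 'dr', 'dr', 'dr']
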
def get_bot_code(code):
    # default to the Clojure bot (keep the response object so we can .read() once below)
    file = urllib.request.urlopen("https://raw.githubusercontent.com/XenthisX/bot/master/bots/bot.clj")
    extension = "bot.clj"
    print(code)
    if code == 'js':
        file = urllib.request.urlopen("https://raw.githubusercontent.com/erichahn/wombats-python/master/bots/bottest.js")
        extension = "bottest.js"
    elif code == 'py':
        file = urllib.request.urlopen("https://raw.githubusercontent.com/erichahn/wombats-python/master/bots/bottest.py")
        extension = "bottest.py"
    return Parse.run_command(extension, file.read())

def jsonOutputTextOnly(toExport, filename):
    l = []
    for tab in toExport:
        d = {}
        d['id'] = tab['idCDLI']
        d['text'] = Parse.getFullText(tab)
        l.append(d)
    f = open(filename, 'w')
    json.dump(l, f)
    f.close()

def insertGlobalInvariant(C6, inv):
    """adds inv, a btree, to C6's list of global invariants.
    Adds the global invariant to rels and facts, since it is now
    established, and we are not allowed to change any of the vars it
    mentions (except from within a fcn that declares the vars as
    globals).  Finally, ``locks'' the global vars from updates by
    placing their names on the ``novars'' list.  Only functions that
    mention the vars in their ``globals'' clause can update the
    global vars.
    """
    C6["globalinvs"] = C6["globalinvs"] + [inv]
    globals_in_inv = Parse.removeDuplicates(Parse.extractVarsFrom(inv))
    C6["novars"] = C6["novars"] + globals_in_inv
    # erase(C6, globals_in_inv)
    reset(C6, globals_in_inv)

def keyword(line):
    regex = re.compile(r"(\sif\s?)+|(\swhile\s?)+|(\sreturn\s?)+")
    regex_space = re.compile(r"(\sif\s)+|(\swhile\s)+|(\sreturn\s)+")
    res = regex.match(line.text)
    if res:
        res_text = res.group(0)
        last_res = regex_space.match(res_text)
        if not last_res:
            line.errors.append(Parse.Error("NOSPCKEY", res_text, "LINE", line))
    return

def parse_document(self):
    # extract all relevant document fields via the parser
    parser = Parse.Parse(self.constants, self.stop_words, self.is_stemming)
    parser.parse_document(self.content)
    self.length = parser.position
    self.max_tf = parser.max_tf
    self.docno = parser.docno
    self.terms = parser.terms
    del parser
    self.num_of_words = len(self.terms)

def includes(includes):
    sys = True
    sys_regex = re.compile(r"#\s*include\s*<[\s\S]+?>")
    usr_regex = re.compile(r"#\s*include\s*\"[\s\S]+?\"")
    for include in includes.lines:
        if sys_regex.match(include.text):
            if not sys:
                include.errors.append(Parse.Error("WRGLOCINCL", include.text, "INC", include))
        elif usr_regex.match(include.text):
            if sys:
                sys = False

def on_all_valuations(formula):
    '''
    Use eval for all valuations. Do all return true?
    '''
    formula_tree = Parse.parse(formula)
    for v in all_valuations(atoms(formula_tree)):
        if not eval(formula_tree, v):
            return False
    return True

def ReadFromFile(fileName):
    fs = open(fileName, 'r')
    n = int(fs.readline())
    type = fs.readline()
    type = type.rstrip('\n')
    equations = []
    for i in range(0, n):
        equations.append(fs.readline())
    IntialGuesses = []
    if type == 'seidel' or type == "All":
        temp = fs.read().split(" ")
        IntialGuesses = [int(numeric_string) for numeric_string in temp]
    if type == "All":
        Parse.Parse(equations, "LU", n, [], 0, 0)
        Parse.Parse(equations, "Gaussian-jordan", n, [], 0, 0)
        Parse.Parse(equations, "Gaussian-elimination", n, [], 0, 0)
        Parse.Parse(equations, "seidel", n, IntialGuesses, 50, 0.00001)
    if type == 'seidel':
        Parse.Parse(equations, type, n, IntialGuesses, 50, 0.00001)

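# A hypothetical input file for ReadFromFile, inferred from the reads above:
# line 1 is n (number of equations), line 2 the solver type, then n equation
# lines, then (for 'seidel' or 'All') space-separated integer initial guesses.
# The equation syntax itself is an assumption, since Parse.Parse is not shown.
#
#   2
#   seidel
#   3x + 2y = 12
#   2x + 5y = 19
#   1 1
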
def function(function):
    inside_func = False
    count_lines = 0
    for line in function.lines:
        if line.text == "{":
            inside_func = True
        elif line.text == "}":
            inside_func = False
        elif inside_func:
            count_lines += 1
            Norme.line(line)
    if count_lines > 25:
        function.errors.append(Parse.Error("NBFUNCLNS", "", "FUNC"))

def lookupGlobalInvariants(C6, varnames):
    """finds in C6 all global invariants relevant to varnames.

    params: C6;  varnames - a sequence of vtrees
    returns: a list of all the invariants saved in C6 that mention
             any variable within varnames
    """
    invlist = []
    for ginv in C6["globalinvs"]:
        foundmes = map(lambda v: Parse.foundIn(v, ginv), varnames)
        if True in foundmes:
            invlist.append(ginv)
    return invlist

def beginTransaction(self):
    print("\n--------------------------------------------------------")
    print("transaction started")
    query = ""
    while not query.lower() == "quit":
        query = input()
        query_type = prs.Parse(self.database, query, self.queryProcessor)
        val = query_type.check_query()
        if val == -1:
            print(colored("Incorrect Query", 'red'))
        elif val == 0:
            break
    print("transaction ended")
    print("\n--------------------------------------------------------")

def CreateTree(file, trees):
    image = Image.open(file)
    name = os.path.basename(file)
    mask = Image.open("masks/" + name + ".png")
    width = image.size[0]
    height = image.size[1]
    pix = mask.load()
    print(name)
    new_image = image.copy()
    new_mask = mask.copy()
    new_image_pixels = new_image.load()
    new_mask_pixels = new_mask.load()
    for j in range(height):
        for i in range(width):
            if pix[i, j] == (255, 0, 0, 255):
                (blob, pix) = GT.eat(i, j, pix)
                (w, h, w_min, h_min) = GT.get_round_size(blob)
                new_tree = GetTree(trees, w, h)
                tree_pix = new_tree.load()
                (new_image_pixels, new_mask_pixels) = DrawTree(new_image_pixels, new_mask_pixels,
                                                               tree_pix, w, h, w_min, h_min)
                (new_mask_pixels, new_blob) = Separate(new_mask_pixels, (w // 2) + w_min, (h // 2) + h_min)
                for k in range(len(new_blob)):
                    new_mask_pixels[new_blob[k][0], new_blob[k][1]] = (0, 0, 0, 255)
    blurredimg = new_image.copy()
    blurredimg = blurredimg.filter(ImageFilter.GaussianBlur(10))
    blurredimg_pix = blurredimg.load()
    for j in range(height):
        for i in range(width):
            if new_mask_pixels[i, j] == (255, 0, 0, 255):
                new_mask_pixels[i, j] = (255, 255, 255, 255)
                new_image_pixels[i, j] = blurredimg_pix[i, j]
    return (new_image, new_mask)

def decrypt(encrypteddata, destination):
    header = encrypteddata[:7]
    encrypteddata = encrypteddata[7:]
    # pick the cipher for the direction and key state, then decrypt the payload
    if refreshed == True:
        if destination == "client":
            decrypteddata = stc.decrypt(encrypteddata)
        elif destination == "server":
            decrypteddata = cts.decrypt(encrypteddata)
    elif refreshed == False:
        if destination == "client":
            decrypteddata = lstc.decrypt(encrypteddata)
        elif destination == "server":
            decrypteddata = lcts.decrypt(encrypteddata)
    msgid, version, paylen = parseheader(decrypteddata)
    parsedata = Parse.packparse(msgid, decrypteddata)
    return decrypteddata

def main1():
    dat = util.get_labeled_questions("data/nt-13588_2.tsv", "data")
    fLog = sys.stdout
    for i, qinfo in enumerate(dat, 1):
        if qinfo.seq_qid[-1] != '0':
            parse = Parse()
            parse.type = Parse.FollowUp
            cond = Condition(3, Condition.OpEqRow, 7)
            parse.conditions = [cond]
            pred = parse.run(qinfo, resinfo)
            fLog.write("(%s) %s\n" % (qinfo.seq_qid, qinfo.question))
            fLog.write("Answer: %s\n" % ", ".join(
                ["(%d,%d)" % coord for coord in qinfo.answer_coordinates]))
            fLog.write("Predictions: %s\n" % ", ".join(
                ["(%d,%d)" % coord for coord in pred]))
            fLog.write("\n")
            fLog.flush()
        # use the gold answers (as resinfo for the next question in the sequence)
        resinfo = util.ResultInfo(qinfo.seq_qid, qinfo.question,
                                  qinfo.ques_word_sequence,
                                  qinfo.answer_coordinates,
                                  qinfo.answer_column_idx)

def __init__(self, parent, id):
    super(PipeWindow, self).__init__(parent)
    self.widget = QWidget()
    self.layout = QGridLayout(self.widget)
    self.book = Parse.Sizing()
    self.setCentralWidget(self.widget)
    # elements in the PipeWindow
    self.widget_format()
    self.setWindowTitle(str(id))
    self.show()

def first_pass(input_file):
    """
    Scan the entire file for labels, e.g. (xxx); add the pair (xxx, address)
    to the symbols table, where address is the number of the instruction
    following (xxx)
    :param input_file: file to translate to Hack binary code
    :return: None
    """
    i = 0  # counter for line instructions
    for ln in input_file:
        ln = ln.strip()
        if not Parse.ignore_line(ln) and not ln.startswith("("):
            i += 1  # counter for label symbols value
        key = ln[1:-1]  # remove brackets from label
        if ln.startswith("("):
            Code_tables.symbols_table[key] = i

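# Worked example for first_pass on Hack-style assembly. Only non-label lines
# advance the counter, so a label maps to the address of the instruction that
# follows it:
#
#       @2        # instruction 0 -> i becomes 1
#       D=A       # instruction 1 -> i becomes 2
#   (LOOP)        # symbols_table['LOOP'] = 2
#       @LOOP     # instruction 2
#
# i.e. Code_tables.symbols_table ends up containing {'LOOP': 2}.
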
def disasmCommands(co_code):
    """
    @param co_code: bytecode.
    @return: array of L{Command} class instances.
    """
    commands = []
    i = 0
    border = len(co_code)
    while i < border:
        offset = i
        opcode = struct.unpack("=B", co_code[i])[0]
        i += 1
        name = None
        argument = None
        if opcode in Opcodes.opcodes:
            name = Opcodes.opcodes[opcode][0]
            if Opcodes.opcodes[opcode][1] != 0:
                argument = Parse.getInt(co_code[i : i + Opcodes.opcodes[opcode][1]])
                i += Opcodes.opcodes[opcode][1]
        commands.append(Command(offset, opcode, name, argument))
    return commands

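# Hedged usage sketch (Python 2, matching the string indexing above, where
# co_code[i] yields a one-byte str suitable for struct.unpack):
#
#   co = compile('a + b', '<string>', 'eval')
#   for cmd in disasmCommands(co.co_code):
#       print cmd.offset, cmd.name, cmd.argument
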
def RunMerge(args):
    cfg = Parse.generate_merge_cfg(args)
    Parse.print_merge_options(cfg)

    if not cfg['debug']:
        logging.disable(logging.CRITICAL)

    regions_df = pd.read_table(cfg['region_file'], compression='gzip' if cfg['region_file'].split('.')[-1] == 'gz' else None)
    regions_df = regions_df[regions_df['job'] == int(cfg['job'])].reset_index(drop=True)

    return_values = {}
    print ''
    try:
        bgzfile = bgzf.BgzfWriter(cfg['out'] + '.gz', 'wb')
    except:
        print Process.Error("failed to initialize bgzip format out file " + cfg['out'] + '.gz').out
        return 1

    if cfg['cpus'] > 1:
        pool = mp.Pool(cfg['cpus'] - 1)
        for i in xrange(1, cfg['cpus']):
            return_values[i] = pool.apply_async(process_regions, args=(regions_df, cfg, i, True,))
            print "submitting job on cpu " + str(i) + " of " + str(cfg['cpus'])
        pool.close()
        print "executing job for cpu " + str(cfg['cpus']) + " of " + str(cfg['cpus']) + " via main process"
        main_return = process_regions(regions_df, cfg, cfg['cpus'], True)
        pool.join()
        if 1 in [return_values[i].get() for i in return_values] or main_return == 1:
            print Process.Error("error detected, see log files").out
            return 1
    else:
        main_return = process_regions(regions_df, cfg, 1, True)
        if main_return == 1:
            print Process.Error("error detected, see log files").out
            return 1

    for i in xrange(1, cfg['cpus'] + 1):
        try:
            logfile = open(cfg['out'] + '.cpu' + str(i) + '.log', 'r')
        except:
            print Process.Error("failed to initialize log file " + cfg['out'] + '.cpu' + str(i) + '.log').out
            return 1
        print logfile.read()
        logfile.close()
        os.remove(cfg['out'] + '.cpu' + str(i) + '.log')

    written = False
    for i in xrange(1, cfg['cpus'] + 1):
        out = '/'.join(cfg['out'].split('/')[0:-1]) + '/' + cfg['out'].split('/')[-1] + '.cpu' + str(i) + '.pkl'
        pkl = open(out, "rb")
        results_final, results_header = pickle.load(pkl)
        if not written:
            bgzfile.write('#' + '\t'.join(results_header) + '\n')
            written = True
        if results_final.shape[0] > 0:
            results_final.replace({'None': 'NA', 'nan': 'NA'}).to_csv(bgzfile, index=False, sep='\t',
                                                                      header=False, na_rep='NA', float_format='%.5g',
                                                                      columns=results_header, append=True)
        pkl.close()
        os.remove(out)
    bgzfile.close()

    print "indexing out file"
    try:
        pysam.tabix_index(cfg['out'] + '.gz', seq_col=0, start_col=1, end_col=1, force=True)
    except:
        print Process.Error('failed to generate index for file ' + cfg['out'] + '.gz').out
        return 1

    if cfg['snpeff']:
        from ConfigParser import SafeConfigParser
        from pkg_resources import resource_filename
        import subprocess
        import xlsxwriter
        import time
        ini = SafeConfigParser()
        ini.read(resource_filename('uga', 'settings.ini'))
        results_final = pd.read_table(cfg['out'] + '.gz')
        outdf = results_final[['#chr', 'pos', 'id', 'a1', 'a2']]
        outdf = outdf.rename(columns={'#chr': '#CHROM', 'pos': 'POS', 'id': 'ID', 'a1': 'REF', 'a2': 'ALT'})
        outdf['QUAL'] = None
        outdf['FILTER'] = None
        outdf['INFO'] = None
        outdf.to_csv(cfg['out'] + '.annot1', header=True, index=False, sep='\t')
        time.sleep(1)
        try:
            cmd = 'java -jar ' + ini.get('main', 'snpeff') + ' -s ' + cfg['out'] + '.annot.summary.html -v -canon GRCh37.75 ' + cfg['out'] + '.annot1 > ' + cfg['out'] + '.annot2'
            print cmd
            p = subprocess.Popen(cmd, shell=True)
            p.wait()
        except KeyboardInterrupt:
            kill_all(p.pid)
            print "canonical annotation process terminated by user"
            sys.exit(1)
        time.sleep(1)
        try:
            cmd = 'java -jar ' + ini.get('main', 'snpsift') + ' extractFields -s "," -e "NA" ' + cfg['out'] + '.annot2 CHROM POS ID REF ALT "ANN[*].ALLELE" "ANN[*].EFFECT" "ANN[*].IMPACT" "ANN[*].GENE" "ANN[*].GENEID" "ANN[*].FEATURE" "ANN[*].FEATUREID" "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" "ANN[*].AA_LEN" "ANN[*].DISTANCE" "ANN[*].ERRORS" | sed "s/ANN\[\*\]/ANN/g" > ' + cfg['out'] + '.annot'
            print cmd
            p = subprocess.Popen(cmd, shell=True)
            p.wait()
        except KeyboardInterrupt:
            kill_all(p.pid)
            print "SnpSift annotation process terminated by user"
            sys.exit(1)
        os.remove(cfg['out'] + '.annot1')
        os.remove(cfg['out'] + '.annot2')
        results_final = results_final.rename(columns={'#chr': '#CHROM', 'pos': 'POS', 'id': 'ID', 'a1': 'REF', 'a2': 'ALT'})
        annot = pd.read_table(cfg['out'] + '.annot')
        out = results_final.merge(annot, how='outer')
        out.fillna('NA', inplace=True)
        wkbk = xlsxwriter.Workbook(cfg['out'] + '.annot.xlsx')
        wksht = wkbk.add_worksheet()
        header_format = wkbk.add_format({'bold': True, 'align': 'center', 'valign': 'vcenter'})
        string_format = wkbk.add_format({'align': 'center', 'valign': 'center'})
        float_format = wkbk.add_format({'align': 'center', 'valign': 'center'})
        float_format.set_num_format('0.000')
        integer_format = wkbk.add_format({'align': 'center', 'valign': 'center'})
        integer_format.set_num_format('0')
        sci_format = wkbk.add_format({'align': 'center', 'valign': 'center'})
        sci_format.set_num_format('0.00E+00')
        i = 0
        for field in out.columns:
            wksht.write(0, i, field, header_format)
            i += 1
        i = 0
        for row in range(out.shape[0]):
            j = 0
            for field in out.columns:
                if field in ['#CHROM', 'POS'] or field.endswith('.filtered') or field.endswith('.n'):
                    wksht.write(row + 1, j, out[field][i], integer_format)
                elif field.endswith(('.p', 'hwe', 'hwe.unrel')):
                    wksht.write(row + 1, j, out[field][i], sci_format)
                elif field.endswith(('.effect', '.stderr', '.or', '.z', 'freq', 'freq.unrel', 'rsq', 'rsq.unrel', 'callrate')):
                    wksht.write(row + 1, j, out[field][i], float_format)
                else:
                    wksht.write(row + 1, j, out[field][i], string_format)
                j += 1
            i += 1
        wksht.freeze_panes(1, 0)
        wkbk.close()
        os.remove(cfg['out'] + '.annot')

    print "process complete"
    return 0

def RunFilter(args):
    cfg = Parse.generate_filter_cfg(args)
    Parse.print_filter_options(cfg)

    if not cfg['debug']:
        logging.disable(logging.CRITICAL)

    print ''
    print "loading file header"
    try:
        handle = pysam.TabixFile(filename=cfg['file'], parser=pysam.asTuple())
    except:
        print Process.Error("unable to load file header").out
        return 1
    header = [x for x in handle.header]
    cols = header[-1].split()

    found = True
    if not cfg['pcol'] in cols:
        print Process.Error("p-value column, --pcol, not found").out
        found = False
    if not cfg['bpcol'] in cols:
        print Process.Error("genomic position column, --bpcol, not found").out
        found = False
    if cfg['miss'] is not None and not cfg['misscol'] in cols:
        print Process.Error("callrate column, --misscol, not found; required for --miss option").out
        found = False
    if cfg['maf'] is not None and not cfg['freqcol'] in cols:
        print Process.Error("allele frequency column, --freqcol, not found; required for --maf option").out
        found = False
    if cfg['mac'] is not None and not cfg['maccol'] in cols:
        print Process.Error("minor allele count column, --maccol, not found; required for --mac option").out
        found = False
    if cfg['cmac'] is not None and not cfg['cmaccol'] in cols:
        print Process.Error("cumulative minor allele count column, --cmaccol, not found; required for --cmac option").out
        found = False
    if cfg['rsq'] is not None and not cfg['rsqcol'] in cols:
        print Process.Error("imputation quality (rsq) column, --rsqcol, not found; required for --rsq option").out
        found = False
    if cfg['hwe'] is not None and not cfg['hwecol'] in cols:
        print Process.Error("Hardy Weinberg p-value column, --hwecol, not found; required for --hwe option").out
        found = False
    if cfg['hwe_maf'] is not None and (not cfg['hwecol'] in cols or not cfg['freqcol'] in cols):
        print Process.Error("either Hardy Weinberg p-value or allele frequency column, --hwecol or --freqcol, not found; both required for --hwe-maf option").out
        found = False
    if not found:
        return 1

    print "reading data from file"
    skip_rows = len(header) - 1
    try:
        r = pd.read_table(cfg['file'], sep='\t', skiprows=skip_rows, compression='gzip')
    except:
        print Process.Error("unable to read data from file " + cfg['file']).out
        return 1
    r = r.loc[~ np.isnan(r[cfg['pcol']])]
    print str(r.shape[0]) + " results found with valid p-values"

    nsnps = r.shape[0]
    if cfg['miss'] is not None:
        r = r.loc[r[cfg['misscol']] >= cfg['miss']]
    if cfg['maf'] is not None:
        r = r.loc[(r[cfg['freqcol']] >= cfg['maf']) & (r[cfg['freqcol']] <= 1 - cfg['maf'])]
    if cfg['mac'] is not None:
        r = r.loc[r[cfg['maccol']] >= cfg['mac']]
    if cfg['cmac'] is not None:
        r = r.loc[r[cfg['cmaccol']] >= cfg['cmac']]
    if cfg['rsq'] is not None:
        r = r.loc[(~ np.isnan(r[cfg['rsqcol']])) & (r[cfg['rsqcol']] >= cfg['rsq'])]
    if cfg['hwe'] is not None:
        if cfg['hwe_maf'] is not None:
            # drop common variants (freq >= hwe_maf) that fail the hwe threshold
            r = r.loc[(~ np.isnan(r[cfg['hwecol']])) & (~ np.isnan(r[cfg['freqcol']])) &
                      (~ ((r[cfg['freqcol']] >= cfg['hwe_maf']) & (r[cfg['hwecol']] < cfg['hwe'])))]
        else:
            r = r.loc[(~ np.isnan(r[cfg['hwecol']])) & (r[cfg['hwecol']] >= cfg['hwe'])]
    print str(r.shape[0]) + " results remain after filtering, " + str(nsnps - r.shape[0]) + " removed"

    if cfg['gc']:
        l = np.median(scipy.chi2.ppf([1 - x for x in r.loc[~ np.isnan(r[cfg['pcol']]), cfg['pcol']].tolist()], df=1)) / scipy.chi2.ppf(0.5, 1)
        print "genomic inflation = " + str(l)
        if cfg['stderrcol'] in r.columns:
            print "adjusting stderr"
            r[cfg['stderrcol']] = r[cfg['stderrcol']] * math.sqrt(l)
        if cfg['waldcol'] in r.columns:
            print "adjusting wald statistic"
            r[cfg['waldcol']] = r[cfg['waldcol']] / math.sqrt(l)
            print "calculating corrected p-value from wald statistic"
            r[cfg['pcol']] = scipy.chisqprob(r[cfg['waldcol']], 1)
        elif cfg['zcol'] in r.columns:
            print "adjusting z statistic"
            r[cfg['zcol']] = r[cfg['zcol']] / math.sqrt(l)
            print "calculating corrected p-value from z statistic"
            r[cfg['pcol']] = 2 * scipy.norm.cdf(-1 * np.abs(r[cfg['zcol']]))
        elif cfg['effectcol'] in r.columns and cfg['stderrcol'] in r.columns:
            print "calculating corrected p-value from effect and stderr using a calculated z statistic"
            r[cfg['pcol']] = 2 * scipy.norm.cdf(-1 * np.abs(r[cfg['effectcol']]) / r[cfg['stderrcol']])
        else:
            print "calculating corrected p-value from existing p-value using an estimated z statistic"
            r[cfg['pcol']] = 2 * scipy.norm.cdf(-1 * np.abs(scipy.norm.ppf(0.5 * r[cfg['pcol']]) / math.sqrt(l)))

    print "writing filtered results to file"
    try:
        bgzfile = bgzf.BgzfWriter(cfg['file'].replace('.gz', '.' + cfg['tag'] + '.gz'), 'wb')
    except:
        print Process.Error("unable to initialize out file " + cfg['file'].replace('.gz', '.' + cfg['tag'] + '.gz')).out
        return 1
    bgzfile.write('\n'.join([x for x in handle.header]) + '\n')
    r[cols].to_csv(bgzfile, header=False, index=False, sep="\t", na_rep='NA', float_format='%.5g')
    bgzfile.close()
    handle.close()

    print "indexing out file"
    try:
        pysam.tabix_index(cfg['file'].replace('.gz', '.' + cfg['tag'] + '.gz'), seq_col=0,
                          start_col=r.columns.get_loc(cfg['bpcol']), end_col=r.columns.get_loc(cfg['bpcol']), force=True)
    except:
        print Process.Error('failed to generate index for file ' + cfg['file'].replace('.gz', '.' + cfg['tag'] + '.gz')).out
        return 1

    print "process complete"
    return 0

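# A self-contained sketch of the genomic-control correction RunFilter applies
# (hypothetical standalone version; the script above uses its own scipy aliases).
# lambda is the median association chi-square over its expected median, and test
# statistics are deflated by sqrt(lambda) before p-values are recomputed.
import numpy as np
from scipy import stats

def gc_correct(p):
    p = np.asarray(p, dtype=float)
    chisq = stats.chi2.ppf(1 - p, df=1)                 # back out 1-df chi-square stats
    lam = np.median(chisq) / stats.chi2.ppf(0.5, df=1)  # genomic inflation factor
    z = stats.norm.ppf(p / 2) / np.sqrt(lam)            # deflate the implied z statistics
    return 2 * stats.norm.cdf(-np.abs(z)), lam          # corrected p-values and lambda
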
def RunSnvplot(args):
    cfg = Parse.generate_snvplot_cfg(args)
    Parse.print_snvplot_options(cfg)

    if not cfg['debug']:
        logging.disable(logging.CRITICAL)

    ro.r('suppressMessages(library(ggplot2))')
    ro.r('suppressMessages(library(grid))')

    handle = pysam.TabixFile(filename=cfg['file'], parser=pysam.asVCF())
    header = [x for x in handle.header]
    skip_rows = len(header) - 1
    cols = header[-1].split()
    pcols = cfg['pcol'].split(',')
    cols_extract = [cfg['chrcol'], cfg['bpcol']] + pcols
    if cfg['qq_strat_freq']:
        if cfg['freqcol'] not in cols:
            print Process.Error("frequency column " + cfg['freqcol'] + " not found, unable to proceed with frequency stratified plots").out
            return 1
        else:
            cols_extract = cols_extract + [cfg['freqcol']]
            print "frequency column " + cfg['freqcol'] + " found"
    if cfg['qq_strat_mac']:
        if cfg['maccol'] not in cols:
            print Process.Error("minor allele count column " + cfg['maccol'] + " not found, unable to proceed with minor allele count stratified plots").out
            return 1
        else:
            cols_extract = cols_extract + [cfg['maccol']]
            print "minor allele count column " + cfg['maccol'] + " found"

    print "importing data"
    r = pd.read_table(cfg['file'], sep='\t', skiprows=skip_rows, usecols=cols_extract, compression='gzip')
    print str(r.shape[0]) + " total variants found"

    for pcol in pcols:
        print "plotting p-values for column " + pcol + " ..."
        results = r[[cfg['chrcol'], cfg['bpcol'], cfg['freqcol'], pcol]] if cfg['freqcol'] in r else r[[cfg['chrcol'], cfg['bpcol'], pcol]]
        results.dropna(inplace=True)
        results = results[(results[pcol] > 0) & (results[pcol] <= 1)].reset_index(drop=True)
        print "  " + str(results.shape[0]) + " variants with plottable p-values"

        results['logp'] = -1 * np.log10(results[pcol]) + 0.0
        ro.globalenv['results'] = results
        l = np.median(scipy.chi2.ppf([1 - x for x in results[pcol].tolist()], df=1)) / scipy.chi2.ppf(0.5, 1)
        # in R: median(qchisq(results$p, df=1, lower.tail=FALSE))/qchisq(0.5,1)
        print "  genomic inflation (all variants) = " + str(l)

        if cfg['qq']:
            print "  generating standard qq plot"
            print "  minimum p-value: " + str(np.min(results[pcol]))
            a = -1 * np.log10(ro.r('ppoints(' + str(len(results.index)) + ')'))
            a.sort()
            results.sort_values(by=['logp'], inplace=True)
            print "  maximum -1*log10(p-value): " + str(np.max(results['logp']))

            ci_upper = -1 * np.log10(scipy.beta.ppf(0.95, range(1, len(results[pcol]) + 1), range(len(results[pcol]), 0, -1)))
            ci_upper.sort()
            ci_lower = -1 * np.log10(scipy.beta.ppf(0.05, range(1, len(results[pcol]) + 1), range(len(results[pcol]), 0, -1)))
            ci_lower.sort()

            ro.globalenv['df'] = ro.DataFrame({'a': ro.FloatVector(a), 'b': ro.FloatVector(results['logp']),
                                               'ci_lower': ro.FloatVector(ci_lower), 'ci_upper': ro.FloatVector(ci_upper)})
            dftext_label = 'lambda %~~% ' + str(l)
            ro.globalenv['dftext'] = ro.DataFrame({'x': ro.r('Inf'), 'y': 0.5, 'lab': dftext_label})

            if cfg['ext'] == 'tiff':
                ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.tiff')
            elif cfg['ext'] == 'eps':
                ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq.eps')
            else:
                ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.pdf')
            ro.r("""
            gp<-ggplot(df)
            pp<-gp +
                aes_string(x='a',y='b') +
                geom_ribbon(aes_string(x='a',ymin='ci_lower',ymax='ci_upper'), data=df, alpha=0.25, fill='black') +
                geom_point(size=2) +
                geom_abline(intercept=0, slope=1, alpha=0.5) +
                scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
                scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
                coord_fixed() +
                theme_bw(base_size = 12) +
                geom_text(aes_string(x='x', y='y', label='lab'), data = dftext, colour="black", vjust=0, hjust=1, size = 4, parse=TRUE) +
                theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14),
                      legend.position = 'none', panel.background = element_blank(), panel.border = element_blank(),
                      panel.grid.minor = element_blank(), panel.grid.major = element_blank(),
                      axis.line = element_line(colour="black"), axis.text = element_text(size=12))
            %s
            """ % (ggsave))

            if np.max(results['logp']) > cfg['crop']:
                print "  generating cropped standard qq plot"
                ro.r('df$b[df$b > ' + str(cfg['crop']) + ']<-' + str(cfg['crop']))
                ro.r('df$shape<-0')
                ro.r('df$shape[df$b == ' + str(cfg['crop']) + ']<-1')
                if cfg['ext'] == 'tiff':
                    ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.cropped.tiff')
                elif cfg['ext'] == 'eps':
                    ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq.cropped.eps')
                else:
                    ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.cropped.pdf')
                ro.r("""
                gp<-ggplot(df)
                pp<-gp +
                    aes_string(x='a',y='b') +
                    geom_ribbon(aes_string(x='a',ymin='ci_lower',ymax='ci_upper'), data=df, alpha=0.25, fill='black') +
                    geom_point(aes(shape=factor(shape)),size=2) +
                    geom_abline(intercept=0, slope=1, alpha=0.5) +
                    scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
                    scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
                    coord_fixed() +
                    theme_bw(base_size = 12) +
                    geom_text(aes_string(x='x', y='y', label='lab'), data = dftext, colour="black", vjust=0, hjust=1, size = 4, parse=TRUE) +
                    theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14),
                          legend.position = 'none', panel.background = element_blank(), panel.border = element_blank(),
                          panel.grid.minor = element_blank(), panel.grid.major = element_blank(),
                          axis.line = element_line(colour="black"), axis.text = element_text(size=12))
                %s
                """ % (ggsave))

        if cfg['qq_strat_freq']:
            print "  generating frequency stratified qq plot"
            strat_ticks = [0.005, 0.01, 0.03, 0.05]
            results['UGA___QQ_BIN___'] = 'E'
            results.loc[(results[cfg['freqcol']] >= 0.01) & (results[cfg['freqcol']] <= 0.99), 'UGA___QQ_BIN___'] = 'D'
            results.loc[(results[cfg['freqcol']] >= 0.03) & (results[cfg['freqcol']] <= 0.97), 'UGA___QQ_BIN___'] = 'C'
            results.loc[(results[cfg['freqcol']] >= 0.05) & (results[cfg['freqcol']] <= 0.95), 'UGA___QQ_BIN___'] = 'B'
            results.loc[(results[cfg['freqcol']] >= 0.1) & (results[cfg['freqcol']] <= 0.9), 'UGA___QQ_BIN___'] = 'A'
            lA = 'NA'
            lB = 'NA'
            lC = 'NA'
            lD = 'NA'
            lE = 'NA'
            lE_n = len(results[pcol][(results[cfg['freqcol']] < 0.01) | (results[cfg['freqcol']] > 0.99)])
            lD_n = len(results[pcol][((results[cfg['freqcol']] >= 0.01) & (results[cfg['freqcol']] < 0.03)) | ((results[cfg['freqcol']] <= 0.99) & (results[cfg['freqcol']] > 0.97))])
            lC_n = len(results[pcol][((results[cfg['freqcol']] >= 0.03) & (results[cfg['freqcol']] < 0.05)) | ((results[cfg['freqcol']] <= 0.97) & (results[cfg['freqcol']] > 0.95))])
            lB_n = len(results[pcol][((results[cfg['freqcol']] >= 0.05) & (results[cfg['freqcol']] < 0.1)) | ((results[cfg['freqcol']] <= 0.95) & (results[cfg['freqcol']] > 0.9))])
            lA_n = len(results[pcol][(results[cfg['freqcol']] >= 0.1) & (results[cfg['freqcol']] <= 0.9)])
            if lE_n > 0:
                lE = np.median(scipy.chi2.ppf([1 - x for x in results[pcol][(results[cfg['freqcol']] < 0.01) | (results[cfg['freqcol']] > 0.99)].tolist()], df=1)) / scipy.chi2.ppf(0.5, 1)
            if lD_n > 0:
                lD = np.median(scipy.chi2.ppf([1 - x for x in results[pcol][((results[cfg['freqcol']] >= 0.01) & (results[cfg['freqcol']] < 0.03)) | ((results[cfg['freqcol']] <= 0.99) & (results[cfg['freqcol']] > 0.97))].tolist()], df=1)) / scipy.chi2.ppf(0.5, 1)
            if lC_n > 0:
                lC = np.median(scipy.chi2.ppf([1 - x for x in results[pcol][((results[cfg['freqcol']] >= 0.03) & (results[cfg['freqcol']] < 0.05)) | ((results[cfg['freqcol']] <= 0.97) & (results[cfg['freqcol']] > 0.95))].tolist()], df=1)) / scipy.chi2.ppf(0.5, 1)
            if lB_n > 0:
                lB = np.median(scipy.chi2.ppf([1 - x for x in results[pcol][((results[cfg['freqcol']] >= 0.05) & (results[cfg['freqcol']] < 0.1)) | ((results[cfg['freqcol']] <= 0.95) & (results[cfg['freqcol']] > 0.9))].tolist()], df=1)) / scipy.chi2.ppf(0.5, 1)
            if lA_n > 0:
                lA = np.median(scipy.chi2.ppf([1 - x for x in results[pcol][(results[cfg['freqcol']] >= 0.1) & (results[cfg['freqcol']] <= 0.9)].tolist()], df=1)) / scipy.chi2.ppf(0.5, 1)
            print "  genomic inflation (MAF >= 10%, n=" + str(lA_n) + ") = " + str(lA)
            print "  genomic inflation (5% <= MAF < 10%, n=" + str(lB_n) + ") = " + str(lB)
            print "  genomic inflation (3% <= MAF < 5%, n=" + str(lC_n) + ") = " + str(lC)
            print "  genomic inflation (1% <= MAF < 3%, n=" + str(lD_n) + ") = " + str(lD)
            print "  genomic inflation (MAF < 1%, n=" + str(lE_n) + ") = " + str(lE)

            # build expected/observed/bin vectors per stratum (bins E..A, rarest first)
            a = np.array([])
            b = np.array([])
            c = np.array([])
            results.sort_values(by=['logp'], inplace=True)
            for qq_bin, label in [('E', 'MAF < 1%'), ('D', '1% <= MAF < 3%'), ('C', '3% <= MAF < 5%'),
                                  ('B', '5% <= MAF < 10%'), ('A', 'MAF >= 10%')]:
                if len(results[results['UGA___QQ_BIN___'] == qq_bin].index) > 0:
                    aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA___QQ_BIN___'] == qq_bin].index)) + ')'))
                    aa.sort()
                    bb = results['logp'][results['UGA___QQ_BIN___'] == qq_bin]
                    cc = results['UGA___QQ_BIN___'][results['UGA___QQ_BIN___'] == qq_bin]
                    a = np.append(a, aa)
                    b = np.append(b, bb)
                    c = np.append(c, cc)
                    print "  minimum p-value (" + label + "): " + str(np.min(results[pcol][results['UGA___QQ_BIN___'] == qq_bin]))
                    print "  maximum -1*log10(p-value) (" + label + "): " + str(np.max(results['logp'][results['UGA___QQ_BIN___'] == qq_bin]))

            ro.globalenv['df'] = ro.DataFrame({'a': ro.FloatVector(a), 'b': ro.FloatVector(b), 'UGA___QQ_BIN___': ro.StrVector(c)})

            if cfg['ext'] == 'tiff':
                ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.tiff')
            elif cfg['ext'] == 'eps':
                ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.eps')
            else:
                ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.pdf')
            ro.r("""
            gp<-ggplot(df, aes_string(x='a',y='b')) +
                geom_point(aes_string(color='UGA___QQ_BIN___'), size=2) +
                scale_colour_manual(values=c("E"="#a8ddb5", "D"="#7bccc4", "C"="#4eb3d3", "B"="#2b8cbe", "A"="#08589e"),
                                    labels=c("E"="MAF < 1%%","D"="1%% <= MAF < 3%%","C"="3%% <= MAF < 5%%","B"="5%% <= MAF < 10%%","A"="MAF >= 10%%")) +
                geom_abline(intercept=0, slope=1, alpha=0.5) +
                scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
                scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
                coord_fixed() +
                theme_bw(base_size = 12) +
                theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14),
                      legend.title = element_blank(), legend.key.height = unit(0.1,"in"), legend.text = element_text(size=5),
                      legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1),
                      panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(),
                      panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
            %s
            """ % (ggsave))

            if np.max(results['logp']) > cfg['crop']:
                print "  generating cropped frequency stratified qq plot"
                ro.r('df$b[df$b > ' + str(cfg['crop']) + ']<-' + str(cfg['crop']))
                ro.r('df$shape<-0')
                ro.r('df$shape[df$b == ' + str(cfg['crop']) + ']<-1')
                if cfg['ext'] == 'tiff':
                    ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.tiff')
                elif cfg['ext'] == 'eps':
                    ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.eps')
                else:
                    ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.pdf')
                ro.r("""
                gp<-ggplot(df, aes_string(x='a',y='b')) +
                    geom_point(aes(shape=factor(shape), color=UGA___QQ_BIN___), size=2) +
                    scale_colour_manual(values=c("E"="#a8ddb5", "D"="#7bccc4", "C"="#4eb3d3", "B"="#2b8cbe", "A"="#08589e"),
                                        labels=c("E"="MAF < 1%%","D"="1%% <= MAF < 3%%","C"="3%% <= MAF < 5%%","B"="5%% <= MAF < 10%%","A"="MAF >= 10%%")) +
                    geom_abline(intercept=0, slope=1, alpha=0.5) +
                    scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
                    scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
                    coord_fixed() +
                    theme_bw(base_size = 12) +
                    guides(shape=FALSE) +
                    theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14),
                          legend.title = element_blank(), legend.key.height = unit(0.1,"in"), legend.text = element_text(size=5),
                          legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1),
                          panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(),
                          panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
                %s
                """ % (ggsave))

        # TODO (disabled in source): a minor allele count stratified qq plot for
        # cfg['qq_strat_mac'] was drafted as a near-copy of the frequency-stratified
        # block above with cfg['maccol'] in place of cfg['freqcol'], but its bin
        # thresholds were never converted from frequencies to counts.

        if cfg['mht']:
            print "  generating standard manhattan plot"
            print "  minimum p-value: " + str(np.min(results[pcol]))
            print "  maximum -1*log10(p-value): " + str(np.max(results['logp']))
            if cfg['gc'] and l > 1:
                print "  adjusting p-values for genomic inflation for p-value column " + pcol
                results[pcol] = 2 * scipy.norm.cdf(-1 * np.abs(scipy.norm.ppf(0.5 * results[pcol]) / math.sqrt(l)))
                print "  minimum post-gc adjustment p-value: " + str(np.min(results[pcol]))
                print "  maximum post-gc adjustment -1*log10(p-value): " + str(np.max(results['logp']))
            else:
                print "  skipping genomic inflation correction"

            print "  calculating genomic positions"
            results.sort_values(by=[cfg['chrcol'], cfg['bpcol']], inplace=True)
            ticks = []
            lastbase = 0
            results['gpos'] = 0
            nchr = len(list(np.unique(results[cfg['chrcol']].values)))
            chrs = np.unique(results[cfg['chrcol']].values)
            if cfg['color']:
                colours = ["#08306B", "#41AB5D", "#000000", "#F16913", "#3F007D", "#EF3B2C", "#08519C", "#238B45",
                           "#252525", "#D94801", "#54278F", "#CB181D", "#2171B5", "#006D2C", "#525252", "#A63603",
                           "#6A51A3", "#A50F15", "#4292C6", "#00441B", "#737373", "#7F2704", "#807DBA", "#67000D"]
            else:
                colours = ["#08589e", "#4eb3d3"] * 12
            if nchr == 1:
                results['gpos'] = results[cfg['bpcol']]
                results['colours'] = "#08589e"
                if results['gpos'].max() - results['gpos'].min() <= 1000:
                    ticks = [x for x in range(results['gpos'].min(), results['gpos'].max()) if x % 100 == 0]
                elif results['gpos'].max() - results['gpos'].min() <= 10000:
                    ticks = [x for x in range(results['gpos'].min(), results['gpos'].max()) if x % 1000 == 0]
                elif results['gpos'].max() - results['gpos'].min() <= 100000:
                    ticks = [x for x in range(results['gpos'].min(), results['gpos'].max()) if x % 10000 == 0]
                elif results['gpos'].max() - results['gpos'].min() <= 200000:
                    ticks = [x for x in range(results['gpos'].min(), results['gpos'].max()) if x % 20000 == 0]
                elif results['gpos'].max() - results['gpos'].min() <= 300000:
                    ticks = [x for x in range(results['gpos'].min(), results['gpos'].max()) if x % 30000 == 0]
                elif results['gpos'].max() - results['gpos'].min() <= 400000:
                    ticks = [x for x in range(results['gpos'].min(), results['gpos'].max()) if x % 40000 == 0]
                elif results['gpos'].max()
- results['gpos'].min() <= 500000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 50000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 600000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 60000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 700000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 70000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 800000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 80000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 900000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 90000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 1000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 100000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 10000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 1000000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 100000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 10000000 == 0] elif results['gpos'].max() - results['gpos'].min() > 100000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 25000000 == 0] else: results['colours'] = "#000000" for i in range(len(chrs)): print " processed chromosome " + str(int(chrs[i])) if i == 0: results.loc[results[cfg['chrcol']] == chrs[i],'gpos'] = results.loc[results[cfg['chrcol']] == chrs[i],cfg['bpcol']] else: lastbase = lastbase + results.loc[results[cfg['chrcol']] == chrs[i-1],cfg['bpcol']].iloc[-1] results.loc[results[cfg['chrcol']] == chrs[i],'gpos'] = (results.loc[results[cfg['chrcol']] == chrs[i],cfg['bpcol']]) + lastbase if results.loc[results[cfg['chrcol']] == chrs[i]].shape[0] > 1: ticks.append(results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0] + (results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[-1] - results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0])/2) else: ticks.append(results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0]) results.loc[results[cfg['chrcol']] == chrs[i],'colours'] = colours[int(chrs[i])] results['logp'] = -1 * np.log10(results[pcol]) if results.shape[0] >= 1000000: sig = 5.4e-8 else: sig = 0.05 / results.shape[0] print " significance level set to p-value = " + str(sig) + " (-1*log10(p-value) = " + str(-1 * np.log10(sig)) + ")" print " " + str(len(results[pcol][results[pcol] <= sig])) + " genome wide significant variants" chr = results[cfg['chrcol']][0] maxy=int(max(np.ceil(-1 * np.log10(sig)),np.ceil(results['logp'].max()))) if maxy > 20: y_breaks = range(0,maxy,5) y_labels = range(0,maxy,5) else: y_breaks = range(0,maxy) y_labels = range(0,maxy) ro.globalenv['df'] = ro.DataFrame({'gpos': ro.FloatVector(results['gpos']), 'logp': ro.FloatVector(results['logp']), 'colours': ro.FactorVector(results['colours'])}) ro.globalenv['ticks'] = ro.FloatVector(ticks) ro.globalenv['labels'] = ro.Vector(["{:,}".format(x/1000) for x in ticks]) ro.globalenv['colours'] = ro.StrVector(colours) ro.globalenv['chrs'] = ro.FloatVector(chrs) print " generating manhattan plot" if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.tiff') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' 
+ pcol + '.mht.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.pdf') if nchr == 1: ro.r(""" gp<-ggplot(df, aes_string(x='gpos',y='logp')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(size=1.5) + scale_x_continuous(expression(Chromosome~~%d~~(kb))'),breaks=ticks,labels=labels) + \ scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + \ theme_bw(base_size = 8) + \ theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, chr, maxy, maxy, ggsave)) else: ro.r(""" gp = ggplot(df, aes_string(x='gpos',y='logp',colour='colours')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(size=1.5) + scale_colour_manual(values=colours) + scale_x_continuous(expression(Chromosome),breaks=ticks,labels=chrs) + scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + theme_bw(base_size = 8) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, maxy, maxy, ggsave)) if maxy > cfg['crop']: maxy = cfg['crop'] ro.r('df$logp[df$logp > ' + str(cfg['crop']) + ']<-' + str(cfg['crop'])) ro.r('df$shape<-0') ro.r('df$shape[df$logp == ' + str(cfg['crop']) + ']<-1') print " generating cropped manhattan plot" if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.cropped.tiff') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.mht.cropped.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' 
+ pcol + '.mht.cropped.pdf') if nchr == 1: ro.r(""" gp<-ggplot(df, aes_string(x='gpos',y='logp')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(aes(shape=factor(shape)),size=1.5) + scale_x_continuous(expression(Chromosome~~%d~~(kb))'),breaks=ticks,labels=labels) + scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + theme_bw(base_size = 8) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, chr, maxy, maxy, ggsave)) else: ro.r(""" gp = ggplot(df, aes_string(x='gpos',y='logp',colour='colours')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(aes(shape=factor(shape)),size=1.5) + scale_colour_manual(values=colours) + scale_x_continuous(expression(Chromosome),breaks=ticks,labels=chrs) + scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + theme_bw(base_size = 8) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=8), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, maxy, maxy, ggsave)) print "process complete" return 0
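# The block above deflates each p-value by the genomic inflation factor before
# plotting. A minimal self-contained sketch of that genomic-control adjustment
# (the function and variable names here are illustrative, not part of this module):

import math
import numpy as np
from scipy import stats

def gc_adjust(pvals):
    """Estimate lambda from the median chi-square statistic and deflate p-values."""
    p = np.asarray(pvals, dtype=float)
    chisq = stats.chi2.ppf(1 - p, df=1)              # p-values to 1-df test statistics
    lam = np.median(chisq) / stats.chi2.ppf(0.5, 1)  # genomic inflation factor
    if lam <= 1:
        return p                                     # no inflation detected: leave p-values alone
    z = stats.norm.ppf(p / 2.0)
    return 2 * stats.norm.cdf(-np.abs(z / math.sqrt(lam)))  # same formula the plot code applies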
def main():
    print("Test")
    Parse.run_command("bottest.js", "console.log(\"Ran bottest.js\")")
def num_invalid_valuations(formula):
    all_val = all_valuations(atoms(Parse.parse(formula)))
    return len(all_val) - num_valid_valuations(formula)
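# num_invalid_valuations counts by complement: every valuation over the
# formula's atoms, minus the valid ones. A standalone sketch of that counting
# pattern (all_valuations and evaluate stand in for this module's helpers):

from itertools import product

def all_valuations(atoms):
    """Every assignment of True/False to the given atoms."""
    atoms = sorted(atoms)
    return [dict(zip(atoms, bits)) for bits in product([True, False], repeat=len(atoms))]

def count_invalid(atoms, evaluate):
    """Count valuations under which `evaluate` (a predicate on one valuation) fails."""
    vals = all_valuations(atoms)
    return len(vals) - sum(1 for v in vals if evaluate(v))

# count_invalid({'p', 'q'}, lambda v: v['p'] or v['q'])  =>  1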
def Mpi_slave(self, result_out, buffer_size, compair):
    """
    Slave process
    @param compair: Comparison object
    """
    ar = Parse()
    status = MPI.Status()
    self.comm.send(1, dest=0, tag=1)
    root = Interaction()
    buffer = buffer_size
    flag_soft = True
    part = 0
    while status.tag != 0:
        # Sleep between probes to reduce resource consumption while idle
        while not self.comm.Iprobe(source=0, tag=MPI.ANY_TAG):
            time.sleep(0.1)
        data = self.comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
        if status.tag == 1:
            # Create the node for the current software
            if flag_soft:
                # counter and software name
                root.setSoft(data[1], data[4])
                flag_soft = False
            # Run the comparison
            result = compair.runComparison(data[0])
            # Parse the output
            ar.runParsing(data[1], data[2], data[3], data[5], data[6], result, root)
            # Send back the result of the command line
            self.comm.send(1, dest=status.source, tag=1)
            # Decrease the buffer
            buffer -= 1
            # Flush the buffer once it is exhausted
            if buffer == 0:
                # Send the results
                # self.comm.send(zlib.compress(root.getResult()), dest=1, tag=2)
                self.Mpi_write_data(root, result_out, data[1], data[4], self.myrank, part)
                # Empty the buffer
                del root
                # Reset the tree
                root = Interaction()
                # Reinitialize the buffer
                buffer = buffer_size
                # Reset the software flag
                flag_soft = True
                # Increment the part number
                part += 1
        # Switching to a new software
        elif status.tag == 2:
            # Send the results
            # self.comm.send(zlib.compress(root.getResult()), dest=1, tag=2)
            # Write any data remaining in the buffer
            if buffer != buffer_size:
                self.Mpi_write_data(root, result_out, data[0], data[1], self.myrank, part)
            # Reinitialize the buffer
            buffer = buffer_size
            # Empty the buffer
            del root
            # Reset the tree
            root = Interaction()
            # Reset the software flag
            flag_soft = True
            # Reset the part number
            part = 0
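# The slave loop above polls with Iprobe plus a short sleep instead of blocking
# in recv, which keeps an idle rank from spinning a core. A minimal sketch of
# that polling pattern (assuming mpi4py; run under mpirun with two or more ranks):

import time
from mpi4py import MPI

def poll_for_work(comm):
    """Wait cheaply for a message from rank 0, then receive it."""
    status = MPI.Status()
    while not comm.Iprobe(source=0, tag=MPI.ANY_TAG):
        time.sleep(0.1)  # yield the CPU between probes
    data = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
    return data, status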
        if c == NOT:
            opposite = prim[1:]
        else:
            opposite = NOT + prim
        if opposite in d:
            return True
    return False

if __name__ == '__main__':
    # def main():
    """main lets you test the algorithm with interactive input"""
    import Parse
    text = raw_input("Type a proposition: ")
    print
    prop0 = Parse.parse(Parse.scan(text))
    print "parse tree: "
    print prop0
    print
    prop1 = removeImplications(prop0)
    print "-> removed:"
    print prop1
    print
    prop2 = moveNegations(prop1)
    print "- shifted inwards:"
    print prop2
    print
    prop3 = makeIntoCNF(prop2)
    print "cnf:"
    print prop3
    print
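# The interactive test above pipes the proposition through three passes:
# removeImplications, moveNegations, makeIntoCNF. A small illustrative sketch
# of the first pass over nested-list trees; the exact tree shape built by
# Parse.parse is an assumption here, not this module's actual format:

def remove_implications(prop):
    """Rewrite (p -> q) as (~p | q), recursively."""
    if isinstance(prop, str):
        return prop  # an atom
    op = prop[0]
    args = [remove_implications(a) for a in prop[1:]]
    if op == '->':
        return ['|', ['~', args[0]], args[1]]
    return [op] + args

# remove_implications(['->', 'p', 'q'])  =>  ['|', ['~', 'p'], 'q']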
def main(args=None):
    rerun = []
    args = Parse.get_args(Parse.get_parser())
    resubmit = False
    if args.which in ['snv','snvgroup','meta','merge','resubmit','tools']:
        if args.which == 'resubmit':
            with open(args.dir + '/' + os.path.basename(args.dir) + '.args.pkl', 'rb') as p:
                qsub = args.qsub if args.qsub else None
                args,cfg = pickle.load(p)
                if qsub:
                    cfg['qsub'] = qsub
            with open(cfg['out'] + '/' + os.path.basename(cfg['out']) + '.rerun', 'r') as f:
                rerun = [int(line.rstrip()) for line in f]
            cfg['replace'] = True
            resubmit = True
        else:
            cfg = getattr(Parse, 'generate_' + args.which + '_cfg')(args.ordered_args)
    elif args.which != 'settings':
        cfg = getattr(Parse, 'generate_' + args.which + '_cfg')(args.ordered_args)

    ##### read settings file #####
    ini = SafeConfigParser()
    ini.read(resource_filename('uga', 'settings.ini'))

    ##### locate qsub wrapper #####
    qsub_wrapper = ini.get('main','wrapper')
    if 'qsub' in args and not os.access(ini.get('main','wrapper'),os.X_OK):
        print Process.print_error('uga qsub wrapper ' + ini.get('main','wrapper') + ' is not executable')
        return

    ##### distribute jobs #####
    if args.which in ['snv','snvgroup','meta','merge','tools']:
        run_type = 0
        if cfg['cpus'] is not None and cfg['cpus'] > 1:
            run_type = run_type + 1
        if cfg['split'] and cfg['qsub'] is not None:
            run_type = run_type + 10
        if cfg['split_n'] and cfg['qsub'] is not None:
            run_type = run_type + 100
        if resubmit:
            jobs_df = pd.read_table(cfg['out'] + '/' + cfg['out'] + '.jobs')
        else:
            if args.which in ['snv','tools']:
                # generate regions dataframe with M rows, either from --snv-map or by splitting data file or --snv-region according to --mb
                # run_type = 0:   run as single job
                # run_type = 1:   --cpus C (distribute M regions over C cpus and run single job, 1 job C cpus)
                # run_type = 10:  --split (split M regions into single region jobs, M jobs 1 cpu)
                # run_type = 100: --split-n N (distribute M regions over N jobs, N jobs 1 cpu)
                # run_type = 11:  --split, --cpus C (split M regions into chunks of size M / C and run M jobs, M jobs C cpus)
                # run_type = 101: --split-n N, --cpus C (distribute M regions over N jobs and distribute each over C cpus, N jobs C cpus)
                if cfg['region_file']:
                    jobs_df = pd.read_table(cfg['region_file'],header=None,names=['region'], compression='gzip' if cfg['region_file'].split('.')[-1] == 'gz' else None)
                    jobs_df['chr'] = [x.split(':')[0] for x in jobs_df['region']]
                    jobs_df['chr_idx'] = [int(x.split(':')[0].replace('X','23').replace('Y','24')) for x in jobs_df['region']]
                    jobs_df['start'] = [int(x.split(':')[1].split('-')[0]) for x in jobs_df['region']]
                    jobs_df['end'] = [int(x.split(':')[1].split('-')[1]) for x in jobs_df['region']]
                    jobs_df['job'] = 1
                    jobs_df['cpu'] = 1
                else:
                    snv_map = []
                    data_files = []
                    if args.which == 'snv':
                        for m in cfg['models']:
                            if cfg['models'][m]['file'] not in data_files:
                                snv_map.extend(Map.map(file=cfg['models'][m]['file'], mb = cfg['mb'], region = cfg['region']))
                                data_files.append(cfg['models'][m]['file'])
                    else:
                        snv_map.extend(Map.map(file=cfg['file'], mb = cfg['mb'], region = cfg['region']))
                    snv_map = list(set(snv_map))
                    jobs_df = pd.DataFrame({'region': snv_map, 'chr': [x.split(':')[0] for x in snv_map], 'chr_idx': [int(x.split(':')[0].replace('X','23').replace('Y','24')) for x in snv_map], 'start': [int(x.split(':')[1].split('-')[0]) for x in snv_map], 'end': [int(x.split(':')[1].split('-')[1]) for x in snv_map]})
                    jobs_df['job'] = 1
                    jobs_df['cpu'] = 1
                    del data_files
                    del snv_map
                jobs_df.sort_values(by=['chr_idx','start'],inplace=True)
                jobs_df = jobs_df[['chr','start','end','region','job','cpu']]
                jobs_df.reset_index(drop=True,inplace=True)
            if args.which in ['meta','merge']:
                # generate regions dataframe with M rows, either from --snv-map or by splitting data file or --snv-region according to --mb
                # run_type = 0:   run as single job
                # run_type = 1:   --cpus C (distribute M regions over C cpus and run single job, 1 job C cpus)
                # run_type = 10:  --split (split M regions into single region jobs, M jobs 1 cpu)
                # run_type = 100: --split-n N (distribute M regions over N jobs, N jobs 1 cpu)
                # run_type = 11:  --split, --cpus C (split M regions into chunks of size M / C and run M jobs, M jobs C cpus)
                # run_type = 101: --split-n N, --cpus C (distribute M regions over N jobs and distribute each over C cpus, N jobs C cpus)
                if cfg['region_file']:
                    jobs_df = pd.read_table(cfg['region_file'],header=None,names=['region'], compression='gzip' if cfg['region_file'].split('.')[-1] == 'gz' else None)
                    jobs_df['chr'] = [int(x.split(':')[0]) for x in jobs_df['region']]
                    jobs_df['start'] = [int(x.split(':')[1].split('-')[0]) for x in jobs_df['region']]
                    jobs_df['end'] = [int(x.split(':')[1].split('-')[1]) for x in jobs_df['region']]
                    jobs_df['job'] = 1
                    jobs_df['cpu'] = 1
                else:
                    snv_map = []
                    data_files = []
                    for f in cfg['files']:
                        if f not in data_files:
                            snv_map.extend(Map.map(file=cfg['files'][f], mb = cfg['mb'], region = cfg['region']))
                            data_files.append(cfg['files'][f])
                    snv_map = list(set(snv_map))
                    jobs_df = pd.DataFrame({'region': snv_map, 'chr': [int(x.split(':')[0]) for x in snv_map], 'start': [int(x.split(':')[1].split('-')[0]) for x in snv_map], 'end': [int(x.split(':')[1].split('-')[1]) for x in snv_map]})
                    jobs_df['job'] = 1
                    jobs_df['cpu'] = 1
                    del data_files
                    del snv_map
                jobs_df = jobs_df[['chr','start','end','region','job','cpu']]
                jobs_df.sort_values(by=['chr','start'],inplace=True)
                jobs_df.reset_index(drop=True,inplace=True)
            if args.which == 'snvgroup':
                # generate regions dataframe with M rows from --snvgroup-map
                # run_type = 0:   run as single job
                # run_type = 1:   --cpus C (distribute M snvgroups over C cpus and run single job, 1 job C cpus)
                # run_type = 10:  --split (split M snvgroups into single region jobs, M jobs 1 cpu)
                # run_type = 100: --split-n N (distribute M snvgroups over N jobs, N jobs 1 cpu)
                # run_type = 101: --split-n N, --cpus C (distribute M snvgroups over N jobs and distribute each job over C cpus, N jobs C cpus)
                if cfg['region_file']:
                    jobs_df = pd.read_table(cfg['region_file'],header=None,names=['region','group_id'], compression='gzip' if cfg['region_file'].split('.')[-1] == 'gz' else None)
                    jobs_df['chr'] = [int(x.split(':')[0]) for x in jobs_df['region']]
                    jobs_df['chr_idx'] = 1
                    jobs_df['start'] = [int(x.split(':')[1].split('-')[0]) for x in jobs_df['region']]
                    jobs_df['end'] = [int(x.split(':')[1].split('-')[1]) for x in jobs_df['region']]
                    jobs_df['job'] = 1
                    jobs_df['cpu'] = 1
                    jobs_df = jobs_df[['chr','start','end','region','group_id','job','cpu']]
                    jobs_df.sort_values(by=['chr','start'],inplace=True)
                    jobs_df.reset_index(drop=True,inplace=True)
                elif cfg['region']:
                    snv_map = []
                    data_files = []
                    for m in cfg['models']:
                        if cfg['models'][m]['file'] not in data_files:
                            snv_map.extend(Map.map(file=cfg['models'][m]['file'], mb = 1000, region = cfg['region']))
                            data_files.append(cfg['models'][m]['file'])
                    snv_map = list(set(snv_map))
                    jobs_df = pd.DataFrame({'region': snv_map, 'chr': [int(x.split(':')[0]) for x in snv_map], 'start': [int(x.split(':')[1].split('-')[0]) for x in snv_map], 'end': [int(x.split(':')[1].split('-')[1]) for x in snv_map]})
                    jobs_df['group_id'] = cfg['region']
                    jobs_df['job'] = 1
                    jobs_df['cpu'] = 1
                    del data_files
                    del snv_map
                    jobs_df = jobs_df[['chr','start','end','region','group_id','job','cpu']]
                    jobs_df.sort_values(by=['chr','start'],inplace=True)
                    jobs_df.reset_index(drop=True,inplace=True)
                else:
                    if cfg['snvgroup_map']:
                        snvgroup_map = pd.read_table(cfg['snvgroup_map'],header=None,names=['chr','pos','marker','group_id'], compression='gzip' if cfg['snvgroup_map'].split('.')[-1] == 'gz' else None)
                        jobs_df = snvgroup_map[['chr','pos','group_id']]
                        jobs_df = jobs_df.groupby(['chr','group_id'])
                        jobs_df = jobs_df.agg({'pos': [np.min,np.max]})
                        jobs_df.columns = ['start','end']
                        jobs_df['chr'] = jobs_df.index.get_level_values('chr')
                        jobs_df['group_id'] = jobs_df.index.get_level_values('group_id')
                        jobs_df['region'] = jobs_df.chr.map(str) + ':' + jobs_df.start.map(str) + '-' + jobs_df.end.map(str)
                        jobs_df['job'] = 1
                        jobs_df['cpu'] = 1
                        jobs_df = jobs_df[['chr','start','end','region','group_id','job','cpu']]
                        jobs_df.drop_duplicates(inplace=True)
                        jobs_df.sort_values(by=['chr','start'],inplace=True)
                        jobs_df.reset_index(drop=True,inplace=True)
            if jobs_df.empty:
                print Process.print_error('job list is empty, no variants found in region/s specified')
                return
            if run_type == 1:
                n = int(np.ceil(jobs_df.shape[0] / float(cfg['cpus'])))
                n_remain = int(jobs_df.shape[0] - (n-1) * cfg['cpus'])
                jobs_df['cpu'] = np.append(np.repeat(range(cfg['cpus'])[:n_remain],n),np.repeat(range(cfg['cpus'])[n_remain:],n-1)).astype(np.int64) + 1
            elif run_type == 10:
                jobs_df['job'] = jobs_df.index.values + 1
            elif run_type == 100:
                n = int(np.ceil(jobs_df.shape[0] / float(cfg['split_n'])))
                n_remain = int(jobs_df.shape[0] - (n-1) * cfg['split_n'])
                jobs_df['job'] = np.append(np.repeat(range(cfg['split_n'])[:n_remain],n),np.repeat(range(cfg['split_n'])[n_remain:],n-1)).astype(np.int64) + 1
            elif run_type == 11 and args.which != 'snvgroup':
                cfg['split_n'] = int(np.ceil(jobs_df.shape[0] / float(cfg['cpus'])))
                n = int(np.ceil(jobs_df.shape[0] / float(cfg['split_n'])))
                n_remain = int(jobs_df.shape[0] - (n-1) * cfg['split_n'])
                jobs_df['job'] = np.append(np.repeat(range(cfg['split_n'])[:n_remain],n),np.repeat(range(cfg['split_n'])[n_remain:],n-1)).astype(np.int64) + 1
                for i in range(1,int(max(jobs_df['job'])) + 1):
                    n = int(np.ceil(jobs_df[jobs_df['job'] == i].shape[0] / float(cfg['cpus'])))
                    n_remain = int(jobs_df[jobs_df['job'] == i].shape[0] - (n-1) * cfg['cpus'])
                    jobs_df.loc[jobs_df['job'] == i,'cpu'] = np.append(np.repeat(range(cfg['cpus'])[:n_remain],n),np.repeat(range(cfg['cpus'])[n_remain:],n-1)).astype(np.int64) + 1
                cfg['split'] = None
            elif run_type == 101:
                n = int(np.ceil(jobs_df.shape[0] / float(cfg['split_n'])))
                n_remain = int(jobs_df.shape[0] - (n-1) * cfg['split_n'])
                jobs_df['job'] = np.append(np.repeat(range(cfg['split_n'])[:n_remain],n),np.repeat(range(cfg['split_n'])[n_remain:],n-1)).astype(np.int64) + 1
                for i in range(1,int(max(jobs_df['job'])) + 1):
                    n = int(np.ceil(jobs_df[jobs_df['job'] == i].shape[0] / float(cfg['cpus'])))
                    n_remain = int(jobs_df[jobs_df['job'] == i].shape[0] - (n-1) * cfg['cpus'])
                    jobs_df.loc[jobs_df['job'] == i,'cpu'] = np.append(np.repeat(range(cfg['cpus'])[:n_remain],n),np.repeat(range(cfg['cpus'])[n_remain:],n-1)).astype(np.int64) + 1
            if int(max(jobs_df['job'])) + 1 > 100000:
                print Process.print_error('number of jobs exceeds 100,000, consider using --split-n to reduce the total number of jobs')
                return

    if args.which in ['snv','snvgroup','meta','merge','tools']:
        print 'detected run type ' + str(run_type) + ' ...'
        if len(rerun) == 0:
            if int(max(jobs_df['job'])) > 1 and cfg['qsub'] is not None:
                if 'mb' in cfg:
                    print ' ' + str(jobs_df.shape[0]) + ' regions of size ' + str(cfg['mb']) + 'mb detected'
                else:
                    print ' ' + str(jobs_df.shape[0]) + ' regions detected'
                print ' an array containing ' + str(int(max(jobs_df['job']))) + ' tasks will be submitted'
                print ' <= ' + str(max(np.bincount(jobs_df['job']))) + ' regions per task'
                print ' <= ' + str(int(max(jobs_df['cpu']))) + ' cpus per task'
                print ' qsub options: ' + cfg['qsub']
                print ' output directory: ' + cfg['out']
                print ' replace: ' + str(cfg['replace'])
                input_var = None
                while input_var not in ['y','n','Y','N']:
                    input_var = raw_input('\nsubmit jobs (yY/nN)? ')
                if input_var.lower() == 'n':
                    print 'canceled by user'
                    return
            if os.path.exists(cfg['out']):
                if args.replace:
                    print 'deleting old data'
                    try:
                        shutil.rmtree(cfg['out'])
                    except OSError:
                        print Process.print_error('unable to replace results directory ' + cfg['out'])
                else:
                    print Process.print_error('results directory ' + cfg['out'] + ' already exists, use --replace to overwrite existing results')
                    return
            try:
                os.mkdir(cfg['out'])
            except OSError:
                pass
            with open(cfg['out'] + '/' + os.path.basename(cfg['out']) + '.args.pkl', 'wb') as p:
                pickle.dump([args, cfg], p)
            if run_type in [10,11,100,101] and jobs_df.shape[0] > 1:
                print "initializing job array database ..."
                try:
                    os.mkdir(cfg['out'] + '/temp')
                except OSError:
                    pass
                for j in range(1, int(max(jobs_df['job'])) + 1):
                    try:
                        os.mkdir(cfg['out'] + '/jobs' + str(100 * ((j-1) / 100) + 1) + '-' + str(100 * ((j-1) / 100) + 100))
                    except OSError:
                        pass
                    try:
                        os.mkdir(cfg['out'] + '/jobs' + str(100 * ((j-1) / 100) + 1) + '-' + str(100 * ((j-1) / 100) + 100) + '/job' + str(j))
                    except OSError:
                        pass
                with open(cfg['out'] + '/' + cfg['out'] + '.files', 'w') as jlist:
                    for j in range(1, int(max(jobs_df['job'])) + 1):
                        if args.which in ['snv','snvgroup','tools','merge']:
                            if 'model_order' in cfg:
                                for m in cfg['model_order']:
                                    if m != '___no_tag___':
                                        jlist.write(str(j) + '\t' + cfg['out'] + '.' + m + '.gz' + '\t' + cfg['out'] + '/jobs' + str(100 * ((j-1) / 100) + 1) + '-' + str(100 * ((j-1) / 100) + 100) + '/job' + str(j) + '/' + cfg['out'] + '.job' + str(j) + '.' + m + '.gz\n')
                                    else:
                                        jlist.write(str(j) + '\t' + cfg['out'] + '.gz' + '\t' + cfg['out'] + '/jobs' + str(100 * ((j-1) / 100) + 1) + '-' + str(100 * ((j-1) / 100) + 100) + '/job' + str(j) + '/' + cfg['out'] + '.job' + str(j) + '.gz\n')
                            else:
                                jlist.write(str(j) + '\t' + cfg['out'] + '.gz' + '\t' + cfg['out'] + '/jobs' + str(100 * ((j-1) / 100) + 1) + '-' + str(100 * ((j-1) / 100) + 100) + '/job' + str(j) + '/' + cfg['out'] + '.job' + str(j) + '.gz\n')
                        if 'meta_order' in cfg:
                            if len(cfg['meta_order']) > 0:
                                for m in cfg['meta_order']:
                                    jlist.write(str(j) + '\t' + cfg['out'] + '.' + m + '.gz' + '\t' + cfg['out'] + '/jobs' + str(100 * ((j-1) / 100) + 1) + '-' + str(100 * ((j-1) / 100) + 100) + '/job' + str(j) + '/' + cfg['out'] + '.job' + str(j) + '.' + m + '.gz\n')
            jobs_df.to_csv(cfg['out'] + '/' + cfg['out'] + '.jobs',header=True,index=False,sep="\t")
            with open(cfg['out'] + '/' + cfg['out'] + '.jobs.run','w') as f:
                f.write("\n".join([str(x) for x in jobs_df['job'].unique()]))
        else:
            if len(rerun) > 0 and cfg['qsub'] is not None:
                print 'detected resubmit ...'
                print ' an array containing ' + str(len(rerun)) + ' tasks will be submitted'
                print ' <= ' + str(max(np.bincount(jobs_df['job']))) + ' regions per job'
                print ' <= ' + str(int(max(jobs_df['cpu']))) + ' cpus per job'
                print ' qsub options: ' + cfg['qsub']
                print ' output directory: ' + cfg['out']
                print ' replace: ' + str(cfg['replace'])
                input_var = None
                while input_var not in ['y','n','Y','N']:
                    input_var = raw_input('\nresubmit jobs (yY/nN)? ')
                if input_var.lower() == 'n':
                    print 'canceled by user'
                    return
            with open(cfg['out'] + '/' + cfg['out'] + '.jobs.run','w') as f:
                f.write("\n".join([str(x) for x in jobs_df['job'][jobs_df['job'].isin(rerun)]]))
            os.remove(cfg['out'] + '/' + os.path.basename(cfg['out']) + '.rerun')

    if args.which == 'settings':
        if 'ordered_args' in args:
            for k in args.ordered_args:
                ini.set('main',k[0],k[1])
            with open(resource_filename('uga', 'settings.ini'), 'w') as f:
                ini.write(f)
        print 'main settings ...'
        for s in ini.sections():
            for k in ini.options(s):
                print ' ' + k + ' = ' + ini.get(s,k)
    elif args.which in ['snv','snvgroup','meta','merge','resubmit','tools']:
        if cfg['qsub']:
            print "submitting jobs\n"
        out = cfg['out']
        joblist = range(1, int(max(jobs_df['job'])) + 1) if len(rerun) == 0 else rerun
        if int(max(jobs_df['job'])) > 1:
            cfg['out'] = out + '/jobsUGA_JOB_RANGE/jobUGA_JOB_ID/' + os.path.basename(out) + '.jobUGA_JOB_ID'
            cfg['job'] = 'UGA_JOB_ID'
            if cfg['qsub']:
                cfg['qsub'] = cfg['qsub'] + ' -t 1-' + str(len(joblist))
        else:
            cfg['out'] = out + '/' + os.path.basename(out)
            cfg['job'] = 1
            if cfg['qsub']:
                cfg['qsub'] = cfg['qsub'] + ' -t 1'
        args.ordered_args = [('out',cfg['out']),('region_file',out + '/' + out + '.jobs'),('job',cfg['job']),('cpus',int(max(jobs_df['cpu'])))] + [x for x in args.ordered_args if x[0] not in ['out','region_file','cpus']]
        cmd = 'Run' + args.which.capitalize() + '(' + str(args.ordered_args) + ')'
        if cfg['qsub']:
            Process.qsub(['qsub'] + cfg['qsub'].split() + ['-N',out,'-o',out + '/temp',qsub_wrapper],'\"' + cmd + '\"',out + '/' + out + '.jobs.run',cfg['out'] + '.log')
        else:
            Process.interactive(qsub_wrapper, cmd, cfg['out'] + '.' + args.which + '.log')
    elif args.which == 'compile':
        files = pd.read_table(args.dir + '/' + os.path.basename(args.dir) + '.files', names=['job','out','file'])
        complete, rerun = Fxns.verify_results(args.dir,files)
        if len(rerun) > 0:
            print Process.print_error('detected ' + str(len(rerun)) + ' failed jobs\n use resubmit module to rerun failed jobs')
            with open(args.dir + '/' + os.path.basename(args.dir) + '.rerun', 'w') as f:
                f.write("\n".join([str(x) for x in rerun]))
        else:
            complete = Fxns.compile_results(args.dir,files)
            if complete:
                input_var = None
                while input_var not in ['y','n','Y','N']:
                    input_var = raw_input('delete obsolete job subdirectories and files for this project (yY/nN)? ')
                if input_var.lower() == 'n':
                    print 'canceled by user'
                else:
                    print 'deleting subdirectories'
                    for d in glob.glob(args.dir + '/jobs*-*'):
                        try:
                            shutil.rmtree(d)
                        except OSError:
                            print Process.print_error('unable to delete job data directory ' + d)
                    print 'deleting temporary directory'
                    try:
                        shutil.rmtree(args.dir + '/temp')
                    except OSError:
                        print Process.print_error('unable to delete temporary directory ' + args.dir + '/temp')
                    print "deleting last job run list"
                    try:
                        os.remove(args.dir + '/' + os.path.basename(args.dir) + '.jobs.run')
                    except OSError:
                        print Process.print_error('unable to delete job run list ' + args.dir + '/' + os.path.basename(args.dir) + '.jobs.run')
            else:
                print Process.print_error('file compilation incomplete')
    elif args.which in ['snvgroupplot','snvplot']:
        cfg['out'] = '.'.join(cfg['file'].split('.')[0:len(cfg['file'].split('.'))-1]) + '.' + args.which
        args.ordered_args = [('out',cfg['out'])] + [x for x in args.ordered_args if x[0] not in ['out']]
        cmd = 'Run' + args.which.capitalize() + '(' + str(args.ordered_args) + ')'
        if cfg['qsub'] is not None:
            Process.qsub(['qsub'] + cfg['qsub'].split() + ['-o',cfg['out'] + '.log',qsub_wrapper],'\"' + cmd + '\"')
        else:
            Process.interactive(qsub_wrapper, cmd, cfg['out'] + '.log')
    elif args.which == 'filter':
        if os.path.exists(cfg['file'].replace('.gz','.' + cfg['tag'] + '.log')):
            if args.replace:
                try:
                    os.remove(cfg['file'].replace('.gz','.' + cfg['tag'] + '.log'))
                except OSError:
                    print Process.print_error('unable to remove existing log file ' + cfg['file'].replace('.gz','.' + cfg['tag'] + '.log'))
                    return
            else:
                print Process.print_error('log file ' + cfg['file'].replace('.gz','.' + cfg['tag'] + '.log') + ' already exists, use --replace to overwrite existing results')
                return
        if os.path.exists(cfg['file'].replace('.gz','.' + cfg['tag'] + '.gz')):
            if args.replace:
                try:
                    os.remove(cfg['file'].replace('.gz','.' + cfg['tag'] + '.gz'))
                except OSError:
                    print Process.print_error('unable to remove existing inflation corrected results file ' + cfg['file'].replace('.gz','.' + cfg['tag'] + '.gz'))
            else:
                print Process.print_error('results file ' + cfg['file'].replace('.gz','.' + cfg['tag'] + '.gz') + ' already exists, use --replace to overwrite existing results')
                return
        if os.path.exists(cfg['file'].replace('.gz','.' + cfg['tag'] + '.gz.tbi')):
            if args.replace:
                try:
                    os.remove(cfg['file'].replace('.gz','.' + cfg['tag'] + '.gz.tbi'))
                except OSError:
                    print Process.print_error('unable to remove existing inflation corrected results index file ' + cfg['file'].replace('.gz','.' + cfg['tag'] + '.gz.tbi'))
            else:
                print Process.print_error('results index file ' + cfg['file'].replace('.gz','.' + cfg['tag'] + '.gz.tbi') + ' already exists, use --replace to overwrite existing results')
                return
        cmd = 'Run' + args.which.capitalize() + '(' + str(args.ordered_args) + ')'
        if cfg['qsub'] is not None:
            Process.qsub(['qsub'] + cfg['qsub'].split() + ['-o',cfg['file'].replace('.gz','.' + cfg['tag'] + '.log'),qsub_wrapper],'\"' + cmd + '\"')
        else:
            Process.interactive(qsub_wrapper, cmd, cfg['file'].replace('.gz','.' + cfg['tag'] + '.log'))
    else:
        print Process.print_error(args.which + " not a currently available module")
    print ''
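# run_type above packs the three scheduling options into decimal "bit" places
# (1 = --cpus, 10 = --split, 100 = --split-n), so 11 means --split plus --cpus
# and 101 means --split-n plus --cpus. A compact sketch of that encoding
# (argument names here are illustrative):

def compute_run_type(cpus=None, split=False, split_n=None, qsub=None):
    run_type = 0
    if cpus is not None and cpus > 1:
        run_type += 1    # distribute regions over multiple cpus
    if split and qsub is not None:
        run_type += 10   # one job per region
    if split_n and qsub is not None:
        run_type += 100  # fixed number of jobs
    return run_type

# compute_run_type(cpus=4, split=True, qsub='-q long')  =>  11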
def proveUniversal(C6, bfactlist, bgoal):
    """
    Tries to prove a bgoal of form, ["forall", lo, i, hi, bprop_i_] from bfactlist.
    First, attempts to show hi <= lo, meaning quantification ranges over empty set.
    If this fails, tries to establish numerical lower and upper bounds for the
    quantification and enumerate proofs for all elements within these bounds.
    If this fails, then searches for a bfact in bfactlist that is a forall of the
    same form, but where its upper bound is hi-1. If success, then tries to prove
    bprop_hi-1_.

    If I have the energy, I'll try to make this smarter later....
    """
    # print "proveUNIVERSAL: bfactlist=", bfactlist
    # print "goal=", bgoal
    lo = bgoal[1]
    hi = bgoal[3]
    i = bgoal[2]
    bprop = bgoal[4]
    # first, see if bgoal in premises:
    success = bgoal in bfactlist
    if not success:
        # next, try to prove that domain is empty, ie, hi <= lo :
        success = verifyRelation(C6, bfactlist, ["<=", hi, lo])
    if not success:
        # next, try to establish numerical lower and upper bounds and
        # prove bprop for all elements in the numerical range:
        lonum = PE.evallToInt(C6, lo)
        hinum = PE.evallToInt(C6, hi)
        if isinstance(lonum, int) and isinstance(hinum, int):
            success = True  # so far, so good...
            for j in range(lonum, hinum):
                stree = Parse.substituteTree(["int", str(j)], i, bprop)
                success = success and proveSequent(C6, bfactlist, stree)
    if not success:
        # then search bfactlist for a forall goal whose body
        # matches bprop and whose bounds cover bgoal's all but one:
        possibles = [f for f in bfactlist if f[0] == "forall"
                     and Parse.substituteTree(i, f[2], f[4]) == bprop
                     # and verifyRelation(C6, bfactlist, ["==", f[1], lo])
                     and verifyRelation(C6, bfactlist, ["<=", f[1], lo])
                     and verifyRelation(C6, bfactlist, ["==", ["+", f[3], ["int", "1"]], hi])]
        if len(possibles) > 0:
            success = proveSequent(C6, bfactlist, Parse.substituteTree(["-", hi, ["int", "1"]], i, bprop))
    if not success:
        # search bfactlist for a forall goal whose body
        # matches bprop and whose bounds cover bgoal's:
        possibles = [f for f in bfactlist if f[0] == "forall"
                     and Parse.substituteTree(i, f[2], f[4]) == bprop
                     and verifyRelation(C6, bfactlist, ["<=", f[1], lo])
                     and verifyRelation(C6, bfactlist, [">=", f[3], hi])]
        success = (len(possibles) > 0)
    return success
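# When lo and hi reduce to concrete ints, the forall above is just a finite
# conjunction, proved instance by instance. A stripped-down sketch of that
# enumeration step, with a plain predicate standing in for proveSequent:

def prove_forall_by_enumeration(lo, hi, prove_instance):
    """True iff prove_instance(j) holds for every j in [lo, hi)."""
    if hi <= lo:
        return True  # empty domain: vacuously true
    return all(prove_instance(j) for j in range(lo, hi))

# prove_forall_by_enumeration(0, 5, lambda j: j * j >= 0)  =>  True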
import Parse as Parse
import csv

# Parse data from csv file
filename = 'Books/webpages.csv'
f = open(filename, 'rU')
f.seek(0)
fields = ['id', 'author_id', 'publisher_id', 'url', 'publication_series_id', 'title_id', 'award_type_id', 'title_series_id', 'award_category_id']
reader = csv.DictReader(f, dialect='excel-tab', fieldnames=fields)

data = []
for row in reader:
    author_id = Parse.nullize(row['author_id'])
    publisher_id = Parse.nullize(row['publisher_id'])
    publication_series_id = Parse.nullize(row['publication_series_id'])
    title_id = Parse.nullize(row['title_id'])
    award_type_id = Parse.nullize(row['award_type_id'])
    title_series_id = Parse.nullize(row['title_series_id'])
    award_category_id = Parse.nullize(row['award_category_id'])
    url = Parse.nullize(row['url'])
    data.append((row['id'], author_id, publisher_id, publication_series_id, title_id, award_type_id, title_series_id, award_category_id, url))

# Insert data into Database
# db = DB.Database('db4free.net','group8','toto123', 'cs322')
#
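# Parse.nullize is used here to turn empty CSV fields into SQL NULLs before
# insertion. Its implementation is not shown in this file; a plausible minimal
# version (an assumption, not the module's actual code) would be:

def nullize(value):
    """Map empty CSV fields to None so the DB layer emits NULL."""
    if value is None or value.strip() == '':
        return None
    return value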
def RunSnvgroup(args):
    cfg = Parse.generate_snvgroup_cfg(args)
    Parse.print_snvgroup_options(cfg)

    if not cfg['debug']:
        logging.disable(logging.CRITICAL)

    regions_df = pd.read_table(cfg['region_file'], compression='gzip' if cfg['region_file'].split('.')[-1] == 'gz' else None)
    regions_df = regions_df[regions_df['job'] == int(cfg['job'])].reset_index(drop=True)
    return_values = {}
    models_out = {}
    bgzfiles = {}
    print ''
    for m in cfg['model_order']:
        print "initializing out file for model " + m
        models_out[m] = cfg['out'] if m == '___no_tag___' else cfg['out'] + '.' + m
        try:
            bgzfiles[m] = bgzf.BgzfWriter(models_out[m] + '.gz', 'wb')
        except:
            print Process.Error("failed to initialize bgzip format out file " + models_out[m] + '.gz').out
            return 1
    if len(cfg['meta_order']) > 0:
        for m in cfg['meta_order']:
            print "initializing out file for meta " + m
            models_out[m] = cfg['out'] + '.' + m
            try:
                bgzfiles[m] = bgzf.BgzfWriter(models_out[m] + '.gz', 'wb')
            except:
                print Process.Error("failed to initialize bgzip format out file " + models_out[m] + '.gz').out
                return 1

    if cfg['cpus'] > 1:
        pool = mp.Pool(cfg['cpus']-1)
        for i in xrange(1,cfg['cpus']):
            return_values[i] = pool.apply_async(process_regions, args=(regions_df,cfg,i,True,))
            print "submitting job on cpu " + str(i) + " of " + str(cfg['cpus'])
        pool.close()
        print "executing job for cpu " + str(cfg['cpus']) + " of " + str(cfg['cpus']) + " via main process"
        main_return = process_regions(regions_df,cfg,cfg['cpus'],True)
        pool.join()
        if 1 in [return_values[i].get() for i in return_values] or main_return == 1:
            print Process.Error("error detected, see log files").out
            return 1
    else:
        main_return = process_regions(regions_df,cfg,1,True)
        if main_return == 1:
            print Process.Error("error detected, see log files").out
            return 1

    for i in xrange(1,cfg['cpus']+1):
        try:
            logfile = open(cfg['out'] + '.cpu' + str(i) + '.log', 'r')
        except:
            print Process.Error("failed to initialize log file " + cfg['out'] + '.cpu' + str(i) + '.log').out
            return 1
        print logfile.read()
        logfile.close()
        os.remove(cfg['out'] + '.cpu' + str(i) + '.log')

    for m in cfg['model_order']:
        written = False
        for i in xrange(1,cfg['cpus']+1):
            out_model_cpu = '/'.join(cfg['out'].split('/')[0:-1]) + '/' + cfg['out'].split('/')[-1] + '.cpu' + str(i) + '.' + m + '.pkl'
            pkl = open(out_model_cpu,"rb")
            results_final,metadata,results_header,tbx_start,tbx_end = pickle.load(pkl)
            if not written:
                bgzfiles[m].write(metadata)
                bgzfiles[m].write("\t".join(results_header) + '\n')
                written = True
            if results_final.shape[0] > 0:
                results_final.replace({'None': 'NA'}).to_csv(bgzfiles[m], index=False, sep='\t', header=False, na_rep='NA', float_format='%.5g', columns = results_header, append=True)
            pkl.close()
            os.remove(out_model_cpu)
        bgzfiles[m].close()
        print "indexing out file for model " + m if m != '___no_tag___' else "indexing out file"
        try:
            pysam.tabix_index(models_out[m] + '.gz',seq_col=0,start_col=tbx_start,end_col=tbx_end,force=True)
        except:
            print Process.Error('failed to generate index for file ' + models_out[m] + '.gz').out
            return 1

    if len(cfg['meta_order']) > 0:
        for m in cfg['meta_order']:
            written = False
            for i in xrange(1,cfg['cpus']+1):
                out_model_meta = '/'.join(cfg['out'].split('/')[0:-1]) + '/' + cfg['out'].split('/')[-1] + '.cpu' + str(i) + '.' + m + '.pkl'
                pkl = open(out_model_meta,"rb")
                results_final_meta,metadata,results_header,tbx_start,tbx_end = pickle.load(pkl)
                if not written:
                    bgzfiles[m].write(metadata)
                    bgzfiles[m].write('\t'.join(results_header) + '\n')
                    written = True
                if results_final_meta.shape[0] > 0:
                    results_final_meta.replace({'None': 'NA'}).to_csv(bgzfiles[m], index=False, sep='\t', header=False, na_rep='NA', float_format='%.5g', columns = results_header, append=True)
                pkl.close()
                os.remove(out_model_meta)
            bgzfiles[m].close()
            print "indexing out file for meta " + m
            try:
                pysam.tabix_index(models_out[m] + '.gz',seq_col=0,start_col=tbx_start,end_col=tbx_end,force=True)
            except:
                print Process.Error('failed to generate index for file ' + models_out[m] + '.gz').out
                return 1

    print "process complete"
    return 0
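# Each per-cpu result chunk above is concatenated into a bgzip file and then
# tabix-indexed so downstream tools can fetch rows by region. A minimal sketch
# of that write-then-index pattern (assuming Biopython's bgzf and pysam; the
# file name and rows below are illustrative only):

from Bio import bgzf
import pysam

out = "example.results.gz"
fh = bgzf.BgzfWriter(out, 'wb')
# tab-separated, coordinate-sorted rows: chromosome, start, end, value
fh.write("1\t100\t200\t0.5\n")
fh.write("1\t300\t400\t0.1\n")
fh.close()

# 0-based column indices: seq_col=0 is the chromosome, columns 1-2 the interval
pysam.tabix_index(out, seq_col=0, start_col=1, end_col=2, force=True)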
##################################################################################
import Database as DB
import Parse as Parse
import csv

# Parse data from csv file
filename = 'Books/award_categories.csv'
f = open(filename, 'rU')
f.seek(0)
fields = ['id', 'name', 'type_id', 'order', 'note_id']
reader = csv.DictReader(f, dialect='excel-tab', fieldnames=fields)

data = []
for row in reader:
    order = Parse.nullize(row['order'])
    note_id = Parse.nullize(row['note_id'])
    data.append((row['id'], row['name'], row['type_id'], order, note_id))

# Insert data into Database
# db = DB.Database('db4free.net','group8','toto123', 'cs322')
#
# sql = 'INSERT INTO Awards (id, title, date, type_id, category_id, note_id) VALUES (%s, %s, %s, %s, %s, %s);'
# db.insertMany(sql, to_db)
########################################################################
### Parse the titles.csv file, and import data to the MySQL database ###
########################################################################
import Database as DB
import Parse as Parse
import csv

# Parse data from csv file
filename = '../CSV/titles_rem.csv'
f = open(filename, 'rU')
f.seek(0)
fields = ['id', 'title', 'translator', 'synopsis', 'note_id', 'series_id', 'series_nb', 'story_length', 'story_type', 'parent', 'language_id', 'title_graphic']
reader = csv.DictReader(f, dialect='excel-tab', fieldnames=fields)

data = []
for row in reader:
    title_graphic = Parse.booleanize2(row['title_graphic'])
    data.append((row['id'], row['title'], row['translator'], row['synopsis'], row['note_id'], row['series_id'], row['series_nb'], row['story_length'], row['story_type'], row['parent'], row['language_id'], title_graphic))

Parse.writeRows(data, 'titles')
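# Parse.booleanize2 presumably normalizes the CSV's yes/no flag into a value
# MySQL can store in a boolean/tinyint column. A guess at a minimal equivalent
# (illustrative only; the real helper may differ):

def booleanize2(value):
    """Map a truthy CSV flag ('Yes', '1', 'true', ...) to 1, anything else to 0."""
    return 1 if str(value).strip().lower() in ('yes', 'y', '1', 'true', 't') else 0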
def codeDisasm(self, offset=0, length=0, verbose=0, xref=False):
    """
    Makes the disassembler output.
    @param offset: start offset in co_code.
    @param length: length of the substring in co_code.
    @param verbose: verbosity of the output (0, 1, 2)
    @param xref: show back references from jumps and such.
    @return: the disassembler output.
    """
    cb = self.getAllCodeBlocks(offset, length)
    commands = self.getCommands(offset, length)
    r = ""
    for cmd in commands:
        if xref and cmd.offset in cb.blocks:
            xstring = cb.strkey(cmd.offset)
            if xstring != "":
                r += "\n> xref " + cb.strkey(cmd.offset) + "\n"
        r += "%.8X " % cmd.offset
        r += "%.2X " % cmd.opcode
        if cmd.mnemonics is not None:
            r += "- " + cmd.mnemonics + " " * (20 - len(cmd.mnemonics))
        if cmd.argument is not None:
            if verbose >= 1:
                r += "%.4X" % cmd.argument
            if cmd.mnemonics in (
                "LOAD_CONST", "COMPARE_OP", "LOAD_FAST", "STORE_FAST", "DELETE_FAST",
                "IMPORT_NAME", "IMPORT_FROM", "STORE_GLOBAL", "DELETE_GLOBAL", "LOAD_GLOBAL",
                "STORE_ATTR", "DELETE_ATTR", "LOAD_ATTR", "STORE_NAME", "DELETE_NAME",
                "LOAD_NAME", "LOAD_CLOSURE", "LOAD_DEREF", "STORE_DEREF", "JUMP_FORWARD",
                "JUMP_IF_TRUE", "JUMP_IF_FALSE", "SETUP_FINALLY", "SETUP_EXCEPT",
                "SETUP_LOOP", "FOR_ITER", "JUMP_ABSOLUTE",
            ):
                if verbose >= 1:
                    r += " = "
                if cmd.mnemonics == "LOAD_CONST":
                    if self.co.consts.value[cmd.argument].__class__.__name__ == "pyCode":
                        r += self.co.consts.value[cmd.argument].info(verbose)
                    else:
                        # r += Parse.shorten(Parse.dropNewLines(self.co.consts.value[cmd.argument].info(verbose)))
                        r += self.co.consts.value[cmd.argument].info(verbose)
                elif cmd.mnemonics == "COMPARE_OP":
                    r += '"' + Opcodes.cmp_op[cmd.argument] + '"'
                elif cmd.mnemonics in ("LOAD_FAST", "STORE_FAST", "DELETE_FAST"):
                    r += self.co.varnames.value[cmd.argument].info(verbose)
                elif cmd.mnemonics in (
                    "IMPORT_NAME", "IMPORT_FROM", "STORE_GLOBAL", "DELETE_GLOBAL",
                    "LOAD_GLOBAL", "STORE_ATTR", "DELETE_ATTR", "LOAD_ATTR",
                    "STORE_NAME", "DELETE_NAME", "LOAD_NAME",
                ):
                    r += self.co.names.value[cmd.argument].info(verbose)
                elif cmd.mnemonics in ("LOAD_CLOSURE", "LOAD_DEREF", "STORE_DEREF"):
                    if cmd.argument < len(self.co.cellvars.value):
                        r += self.co.cellvars.value[cmd.argument].info(verbose)
                    else:
                        r += self.co.freevars.value[cmd.argument - len(self.co.cellvars.value)].info(verbose)
                elif cmd.mnemonics in (
                    "JUMP_FORWARD", "JUMP_IF_TRUE", "JUMP_IF_FALSE",
                    "SETUP_FINALLY", "SETUP_EXCEPT", "SETUP_LOOP", "FOR_ITER",
                ):
                    r += "-> %.8X" % (cmd.offset + cmd.argument + cmd.length)
                elif cmd.mnemonics == "JUMP_ABSOLUTE":
                    r += "-> %.8X" % cmd.argument
            else:
                if verbose == 0:
                    r += "r%.4X" % cmd.argument
        if verbose >= 2 and len(Opcodes.opcodes[cmd.opcode]) > 2:
            r += "\n" + Parse.indentText(Parse.narrowText(Opcodes.opcodes[cmd.opcode][2]), 1)
        r += "\n"
    return r
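# For comparison, CPython's standard library exposes the same raw material
# through the dis module (Python 3 shown); this prints offsets, opnames, and
# argument representations much like codeDisasm's columns:

import dis

def sample(x):
    return x + 1

for ins in dis.get_instructions(sample):
    print("%.8X  %-20s %s" % (ins.offset, ins.opname, ins.argrepr))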
def RunTools(args):
    cfg = Parse.generate_tools_cfg(args)
    Parse.print_tools_options(cfg)

    if not cfg['debug']:
        logging.disable(logging.CRITICAL)

    regions_df = pd.read_table(cfg['region_file'], compression='gzip' if cfg['region_file'].split('.')[-1] == 'gz' else None)
    regions_df = regions_df[regions_df['job'] == int(cfg['job'])].reset_index(drop=True)
    return_values = {}
    print ''
    print "initializing out file"
    try:
        bgzfile = bgzf.BgzfWriter(cfg['out'] + '.gz', 'wb')
    except:
        print Process.Error("failed to initialize bgzip format out file " + cfg['out'] + '.gz').out
        return 1

    if cfg['cpus'] > 1:
        pool = mp.Pool(cfg['cpus']-1)
        for i in xrange(1,cfg['cpus']):
            return_values[i] = pool.apply_async(process_regions, args=(regions_df,cfg,i,True,))
            print "submitting job on cpu " + str(i) + " of " + str(cfg['cpus'])
        pool.close()
        print "executing job for cpu " + str(cfg['cpus']) + " of " + str(cfg['cpus']) + " via main process"
        main_return = process_regions(regions_df,cfg,cfg['cpus'],True)
        pool.join()
        if 1 in [return_values[i].get() for i in return_values] or main_return == 1:
            print Process.Error("error detected, see log files").out
            return 1
    else:
        main_return = process_regions(regions_df,cfg,1,True)
        if main_return == 1:
            print Process.Error("error detected, see log files").out
            return 1

    for i in xrange(1,cfg['cpus']+1):
        try:
            logfile = open(cfg['out'] + '.cpu' + str(i) + '.log', 'r')
        except:
            print Process.Error("failed to initialize log file " + cfg['out'] + '.cpu' + str(i) + '.log').out
            return 1
        print logfile.read()
        logfile.close()
        os.remove(cfg['out'] + '.cpu' + str(i) + '.log')

    written = False
    for i in xrange(1,cfg['cpus']+1):
        cpu_regions_df = regions_df[regions_df['cpu'] == i].reset_index()
        for j in xrange(0,len(cpu_regions_df.index)):
            f_temp = glob.glob(cfg['out'] + '.cpu' + str(i) + '.chr' + cpu_regions_df['region'][j].replace(':','bp') + '*.gz')[0]
            try:
                h = pysam.TabixFile(filename=f_temp,parser=pysam.asVCF())
            except:
                print Process.Error("failed to load vcf file " + f_temp)
                return 1
            if not written:
                for row in h.header:
                    bgzfile.write(str(row) + '\n')
                written = True
            h_iter = h.fetch(region=str(cpu_regions_df['chr'][j]))
            for row in h_iter:
                bgzfile.write(str(row) + '\n')
            for f in glob.glob(cfg['out'] + '.cpu' + str(i) + '.chr' + cpu_regions_df['region'][j].replace(':','bp') + '.*'):
                os.remove(f)

    bgzfile.close()
    print "indexing out file"
    try:
        pysam.tabix_index(cfg['out'] + '.gz',preset="vcf",force=True)
    except:
        print Process.Error('failed to generate index').out
        return 1
    print "process complete"
    return 0
def evall(C6, etree):
    """evaluates etree into PE format, using store and heap in C6

    params: C6; etree in usual form, defined in Parse.py
    returns: resulting PE-value. If etree is malformed or evall gets lost, it returns {}
    """
    ans = {}
    # case on structure of etree:
    op = etree[0]
    if op == "var":  # etree is ["var", vname], where vname is a string(!)
        store = C6["store"]
        vname = etree[1]
        if vname in store:
            ans = store[vname]
        else:
            # invent a dummy name for the var and return it...
            # newvalue = make(makeSym())
            # store[vname] = newvalue
            # ans = newvalue  # {(_cn,):1, ():0}
            # is a free var
            if verbose:
                print "WARNING: evall couldn't find var " + vname + " in the store"
    elif op == "int":  # etree is ["int", "n"]
        ans = make(int(etree[1]))
    elif op == "readInt":
        ans = make(makeSym())  # it's an unknown, new input int
    elif op == "index":  # ["index", ["var", v], etree]
        arrayloc = peToTuple(evall(C6, etree[1]))  # get loc of array in heap
        if arrayloc in C6["heap"]:
            vector = C6["heap"][arrayloc][1]
            index = peToTuple(evall(C6, etree[2]))  # PE val to tuple rep.
            if index in vector:  # lookup listname[index]
                ans = vector[index]
            else:
                # can't find index in vector, so try to prove index
                # equal to an existing key:
                listkeys = vector.keys()
                indexpe = tupleToPe(index)
                alias = {}
                for key in listkeys:
                    keype = tupleToPe(key)
                    found = proveRELATION(C6["rels"], ["==", keype, indexpe])
                    if found:
                        alias = key
                        break
                if alias != {}:
                    ans = vector[alias]
                else:
                    if verbose:
                        print "WARNING: evall could not resolve " + etree[1][1] + str(etree[2]) + " in the store. Will fake it."
                    newvalue = make(makeSym())
                    vector[index] = newvalue
                    ans = newvalue
        else:
            error("scalar or unknown var " + Parse.tostringExpr(etree[1]) + " cannot be indexed")
    elif op == "len":  # etree is ["len", vtree]
        arrayloc = peToTuple(evall(C6, etree[1]))  # get loc of array in heap
        if arrayloc in C6["heap"]:
            ans = C6["heap"][arrayloc][0]
        else:
            if verbose:
                print "WARNING: evall couldn't find array " + etree[1][1] + " in the store"
    elif op == "list":  # etree is ["list", etreelist] --- const array
        newvector = {}
        elems = [evall(C6, e) for e in etree[1]]
        for i in range(len(elems)):
            newvector[peToTuple(make(i))] = elems[i]
        newloc = make(makeSym())  # for now, we treat locns like ints
        C6["heap"][peToTuple(newloc)] = (make(len(elems)), newvector)
        ans = newloc  # built a new array
    elif op == "call":  # etree is ["call", v, etreelist]
        # makes a ``skolem constant'' out of the call --- does NOT evall call
        fname = etree[1]
        args = [peToTuple(evall(C6, e)) for e in etree[2]]
        # build key, (fname, arg0, arg1, ..., argn), and make a PE value for it:
        ans = make((fname,) + tuple(args))
    elif op == "+":  # etree is ["+", e1, e2]
        pe1 = evall(C6, etree[1])
        pe2 = evall(C6, etree[2])
        ans = add(pe1, pe2)  # places sum of pe1 and pe2 into ans
        # SORRY -- can no longer do list addition/append this way:
        # elif isinstance(pe1, tuple) and isinstance(pe2, tuple):  # both lists
        #     len1 = pe1[0]  # length of first list
        #     anslen = add(len1, pe2[0])  # length of combined lists
        #     ansmap = {}
        #     for k in pe1[1].keys():
        #         ansmap[k] = pe1[1][k]  # copying items in pe1 list into ans
        #     for k in pe2[1].keys():
        #         newkey = peToTuple(add(len1, tupleToPe(k)))
        #         ansmap[newkey] = pe2[1][k]
        #     ans = (anslen, ansmap)
        # else:
        #     error("adding arrays and ints")
    elif op == "-":  # etree is ["-", e1, e2]
        pe1 = evall(C6, etree[1])
        pe2 = evall(C6, etree[2])
        ans = subtract(pe1, pe2)
    elif op == "*":  # etree is ["*", e1, e2]
        pe1 = evall(C6, etree[1])
        pe2 = evall(C6, etree[2])
        ans = mult(pe1, pe2)
    else:
        # the expression is not an int-typed expr, and we are lost
        error("evall cannot evaluate this non-int expr: " + str(etree))
    return ans
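# The evaluator dispatches on the head tag of each nested-list expression tree.
# A tiny self-contained model of that dispatch over plain Python ints instead
# of PE values (purely illustrative):

def eval_etree(etree, store):
    """Evaluate ["int", "n"] / ["var", name] / ["+", e1, e2] style trees."""
    op = etree[0]
    if op == "int":
        return int(etree[1])
    if op == "var":
        return store[etree[1]]
    if op == "+":
        return eval_etree(etree[1], store) + eval_etree(etree[2], store)
    if op == "-":
        return eval_etree(etree[1], store) - eval_etree(etree[2], store)
    if op == "*":
        return eval_etree(etree[1], store) * eval_etree(etree[2], store)
    raise ValueError("unknown operator: " + str(op))

# eval_etree(["+", ["int", "2"], ["var", "x"]], {"x": 3})  =>  5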