def handle(argv):
    util.print_task(util.TASK_ANALYZE_PARAMETERS)
    absolute_path = util.ABSOLUTE_PATH
    functions = argv.getlist("function[]")
    input_type = argv.get("type")
    if input_type:
        options = {util.ABSOLUTE_PATH_OPTION: absolute_path}
        options[util.TYPE_OPTION] = input_type
        if options[util.TYPE_OPTION] == "vcf":
            options[util.INPUT_OPTION] = input_validation(argv.get("input"), input_type)
        else:
            options[util.INPUT_OPTION] = input_validation(argv.getlist("input[]"), input_type)
        options[util.OUTPUT_OPTION] = output_validation(argv.get("output"))
    elif functions:
        options = additional_processing_handler(absolute_path, argv)
        options[util.ADD_PROCESSING_OPTION] = functions
    else:
        options = binding_prediction_handler(absolute_path, argv)
    util.print_status(util.TASK_SUCCESS)
    return options
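# Usage sketch (an assumption, not part of the original module): `argv` only
# needs `get`/`getlist` with bracketed keys like "input[]", which matches
# werkzeug's MultiDict (e.g. Flask's `request.args`). The paths below are
# hypothetical; the option names and validators come from this module.
from werkzeug.datastructures import MultiDict

argv = MultiDict([("type", "vcf"),
                  ("input", "variants.vcf"),
                  ("output", "results/")])
opts = handle(argv)  # returns the validated options dict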
def get_stored_paths(self):
    cwd = util.replace_home_with_tilde(util.get_cwd())
    cwd = path_strip(cwd)
    oldpwd = util.replace_home_with_tilde(os.environ.get("OLDPWD", cwd))
    oldpwd = path_strip(oldpwd)
    paths = []
    with open(self.config["history_file"]) as afile:
        entries = afile.read().split("\n")
    check_existence = self.config["check_directory_existence"]
    # This may take a while, so print something: there may be network paths,
    # or the metadata may not be in the system cache yet.
    if check_existence:
        util.print_status("Checking the existence of directories...", truncate=True)
    for line in entries:
        path = line.strip()
        if path in [cwd, oldpwd]:
            continue
        if check_existence:
            exists = os.path.exists(expanduser(path))
        else:
            exists = True
        paths.append((path_strip(path), exists))
    if check_existence:
        util.remove_status()
    # cwd is always first; the previous path in the current shell is always
    # second, if available.
    paths.insert(0, (cwd, os.path.exists(expanduser(cwd))))
    if cwd != oldpwd:
        paths.insert(1, (oldpwd, os.path.exists(expanduser(oldpwd))))
    return paths
def extract_users(self, calendar_df_iterator):
    """
    Given an iterator over calendar dataframes, constructs and returns
    a dataframe containing all users.
    """
    print_header("EXTRACTING USERS")

    #==========[ ITERATE OVER ALL DFS ]==========
    for cdf in calendar_df_iterator():
        print_status("Extract users", "next df")

        #=====[ Step 1: sort by user ]=====
        print_inner_status("extract_users", "sorting by user id")
        cdf = cdf.sort_values('user')

        #=====[ Step 2: init user representations ]=====
        print_inner_status("extract_users", "initializing user representations")
        unique_uids = [uid for uid in cdf['user'].unique()]
        for uid in unique_uids:
            if uid not in self.user_representations:
                self.user_representations[uid] = self.init_user_representation(uid)

        #=====[ Step 3: update the user representations ]=====
        print_inner_status("extract_users", "updating user representations")
        cdf.apply(self.update_user_representation, axis=1)

    #=====[ Step 4: convert to df, delete irrelevant stuff ]=====
    print_inner_status("extract_users", "converting to dataframe")
    self.users_df = pd.DataFrame(list(self.user_representations.values()))
    del self.user_representations
    return self.users_df
def train_semantic_analysis(self):
    """
    PUBLIC: train_semantic_analysis
    -------------------------------
    Finds parameters for self.semantic_analysis.
    """
    #=====[ Step 1: get the corpus ]=====
    print_status("train_semantic_analysis", "getting corpus/dictionary")
    corpus, dictionary = self.get_corpus_dictionary()

    #=====[ Step 2: train ]=====
    print_status("train_semantic_analysis", "training semantic analysis")
    self.semantic_analysis.train(corpus, dictionary)
def execute(opts):
    util.print_task(util.TASK_ALLELE_TYPING)
    fastq_input = opts[util.FASTQ_INPUT_OPTION]
    output = opts[util.OUTPUT_OPTION] + util.ALLELE_DIRECTORY
    directory = os.path.dirname(output)
    if not os.path.exists(directory):
        os.makedirs(directory)
    # group fastq files by the sample prefix before the first "_"
    fls = defaultdict(list)
    for f in fastq_input:
        sample = f.split("/")[-1].split("_")[0]
        fls[sample].append(f)
    for sample in fls:
        cmd = ("OptiTypePipeline.py -i " + " ".join(fls[sample]) +
               " -r -p " + sample + " -o " + output)
        try:
            time.sleep(5)
            system(cmd)
            time.sleep(5)
        except Exception as e:
            raise AllelePredictionException(str(e))
        alleles = set()
        with open(output + sample + "_result.tsv", "r") as r:
            r.readline()  # skip header
            for line in r:
                line = line.rstrip().split("\t")
                for i in line[1:7]:
                    alleles.add(i.replace("A*", "HLA-A*")
                                 .replace("B*", "HLA-B*")
                                 .replace("C*", "HLA-C*"))
        with open(output + sample + ".tsv", "w") as out:
            out.write("allele" + "\n")
            out.write("\n".join(alleles))
        system("rm -f " + output + sample + "_result.tsv")
    for p, d, files in os.walk(output):
        for y in files:
            if not y.endswith(".tsv"):
                system("rm -f " + os.path.join(p, y))
    util.print_status(util.TASK_SUCCESS)
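# Grouping sketch (hypothetical paths, not from the original source): fastq
# files are bucketed by the prefix before the first "_" in the filename, so
# paired-end reads of one sample must share that prefix.
from collections import defaultdict

fls = defaultdict(list)
for f in ["/data/S1_R1.fastq", "/data/S1_R2.fastq", "/data/S2_R1.fastq"]:
    fls[f.split("/")[-1].split("_")[0]].append(f)
print(dict(fls))
# {'S1': ['/data/S1_R1.fastq', '/data/S1_R2.fastq'], 'S2': ['/data/S2_R1.fastq']}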
def pull_activities(pull_start, chunk_size):
    """
    Function: pull_activities
    -------------------------
    Pulls from index 'pull_start' for chunk_size entries; returns them
    in a well-formatted dataframe.
    """
    print_status("Pulling activities",
                 "%d to %d" % (pull_start, pull_start + chunk_size))

    #=====[ Step 1: get the data ]=====
    params = {'from': pull_start, 'to': pull_start + chunk_size, 'size': 500}
    request = requests.get(elasticsearch_activities_endpoint,
                           auth=(elasticsearch_username, elasticsearch_password),
                           params=params)

    #=====[ Step 2: preprocess it ]=====
    data_df = pre.preprocess_a(request.json())
    return data_df
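# Usage sketch (an assumption, not in the original source): walk the index in
# fixed-size chunks and concatenate the per-chunk dataframes. `total_entries`
# and the chunk size are hypothetical parameters.
import pandas as pd

total_entries = 5000
chunk_size = 500
chunks = [pull_activities(start, chunk_size)
          for start in range(0, total_entries, chunk_size)]
activities_df = pd.concat(chunks, ignore_index=True)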
def mutate(vcf_info, opts):
    abs_path = opts[util.ABSOLUTE_PATH_OPTION]
    output = opts[util.OUTPUT_OPTION]
    log_not_found = []

    ############################## RefSeq_human_full.fasta reading ##############################
    util.print_task(util.TASK_LOAD_PROTEIN_FILE)
    refseq_human = read_protein_file(abs_path)
    util.print_status(util.TASK_SUCCESS)

    ############################# Transcripts_refseq.fasta reading ##############################
    util.print_task(util.TASK_LOAD_TRANSCRIPT_FILE)
    refseq_transc, nm_np_conversor = read_transcript_file(abs_path)
    util.print_status(util.TASK_SUCCESS)

    ################################# vcf info file processing ##################################
    mutations = defaultdict(list)
    samples = set()
    curr_sample = 1
    util.print_task(util.TASK_PROCESS_MUTATION)
    with open(vcf_info, "r") as f:
        f.readline()  # skip header
        for line in f:
            try:
                mutation = Mutation(line, nm_np_conversor, refseq_transc, refseq_human)
            except KeyError:
                log_not_found.append(line.rstrip())
                continue
            samples.add(mutation.sample)
            if mutation.mut_protein_sequence:
                if len(samples) == curr_sample:
                    mutations[mutation.transcript].append(mutation)
                else:
                    # a new sample started: flush the previous sample's mutations
                    first_transcript = next(iter(mutations))
                    generate_report.mutation(mutations[first_transcript][0].sample,
                                             mutations, output)
                    mutations = defaultdict(list)
                    mutations[mutation.transcript].append(mutation)
                    curr_sample += 1
    first_transcript = next(iter(mutations))
    generate_report.mutation(mutations[first_transcript][0].sample, mutations, output)
    util.print_status(util.TASK_SUCCESS)
def load(self):
    """
    PUBLIC: load
    ------------
    Loads in all parameters.
    """
    #=====[ Step 1: load in semantic analysis ]=====
    print_status("Initialization", "Loading ML parameters (Begin)")
    self.semantic_analysis.load()
    print_status("Initialization", "Loading ML parameters (End)")

    #=====[ Step 2: transfer over models to inference ]=====
    print_status("Initialization", "Constructing Inference instance (Begin)")
    self.inference = Inference(self.semantic_analysis.lda_model,
                               self.semantic_analysis.lda_model_topics)
    print_status("Initialization", "Constructing Inference instance (End)")
def get_corpus_dictionary(self):
    """
    PRIVATE: get_corpus_dictionary
    ------------------------------
    Assembles a gensim corpus and dictionary from activities_df,
    where each text is name || words.
    """
    #=====[ Step 1: iterate through all activity dataframes ]=====
    print_status("get_corpus", "assembling texts")
    texts = []
    for df in self.storage_delegate.iter_activity_dfs():
        print_inner_status("assembling texts", "next df")
        texts += list(df.apply(self.extract_text, axis=1))

    #=====[ Step 2: get dictionary ]=====
    print_status("get_corpus", "assembling dictionary")
    dictionary = gensim.corpora.Dictionary(texts)

    #=====[ Step 3: get corpus ]=====
    print_status("get_corpus", "assembling corpus")
    corpus = [dictionary.doc2bow(text) for text in texts]

    return corpus, dictionary
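# Usage sketch (an assumption; the real training lives in
# self.semantic_analysis.train): the corpus/dictionary pair returned above is
# in the standard gensim format, so an LDA model can be fit from it directly.
# `analyzer` and `num_topics` are hypothetical.
import gensim

corpus, dictionary = analyzer.get_corpus_dictionary()
lda_model = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=20)
print(lda_model.show_topics(num_topics=5))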
def preprocess_ce(self, ce):
    """
    PUBLIC: preprocess_ce
    ---------------------
    Given an object representing calendar events (either json or a pandas
    dataframe), returns a correctly-formatted version.
    """
    #=====[ Step 1: ce -> dataframe representation ]=====
    df = self.get_dataframe_rep(ce)

    #=====[ Step 2: apply formatting operations ]=====
    print_status("preprocess_ce", "dropping unnecessary columns")
    df = self.retain_columns(df, ce_retain_cols)
    print_status("preprocess_ce", "reformatting location")
    df = self.reformat_location(df)
    print_status("preprocess_ce", "filtering by location")
    df = self.filter_location(df)
    print_status("preprocess_ce", "reformatting dates")
    df = self.reformat_date(df)
    print_status("preprocess_ce", "reformatting name")
    df = self.reformat_name(df)
    print_status("preprocess_ce", "reformatting description")
    df = self.reformat_description(df)
    return df
def execute(opts):
    util.print_task(util.TASK_PREDICT_BINDING)
    abs_path = opts[util.ABSOLUTE_PATH_OPTION]
    pep_len = opts[util.LENGTH_OPTION]
    method = opts[util.METHOD_OPTION]
    hlas = opts[util.ALLELE_OPTION]
    parallel = opts[util.PARALLEL_OPTION]
    p_class = opts[util.CLASS_OPTION]
    input_path = opts[util.OUTPUT_OPTION]
    mutations_path = input_path + util.MUTATION_DIRECTORY
    raw_predictions_path = (input_path + "/c" + str(p_class) + "_" +
                            util.PREDICTION_RAW_DIRECTORY)
    directory = path.dirname(raw_predictions_path)
    if not path.exists(directory):
        makedirs(directory)
    files = [f for f in listdir(mutations_path)
             if path.isfile(path.join(mutations_path, f))]
    if not files:
        raise NoMutatedFileWasFoundException
    cmds = []
    for f in files:
        if f.startswith("."):
            continue
        # for mhcflurry input, rewrite fasta headers, replacing "." with "_"
        if method == "mhcflurry":
            with open(mutations_path + f, "r") as st:
                line_fasta = []
                for line in st:
                    if line.startswith(">"):
                        line_fasta.append(line.replace(".", "_"))
                    else:
                        line_fasta.append(line)
            with open(mutations_path + f, "w") as pfasta_out:
                pfasta_out.write("".join(line_fasta))
        output_file = raw_predictions_path + f
        for i in pep_len:
            pl = [str(i)] * len(hlas)
            if p_class == 1:
                if method == "mhcflurry":
                    cmds.append("mhctools --mhc-predictor mhcflurry"
                                " --input-fasta-file " + mutations_path + f +
                                " --extract-subsequences --mhc-alleles " +
                                ",".join(hlas) + " --mhc-peptide-lengths " +
                                ",".join(pl) + " --output-csv " + output_file)
                else:
                    cmds.append("python " + util.PREDICTION_CLASS_COMMAND[p_class] +
                                method.split("iedb_")[1] + " \"" + ",".join(hlas) +
                                "\" " + ",".join(pl) + " " + mutations_path + f +
                                " >> " + output_file)
            else:
                cmds.append("python " + util.PREDICTION_CLASS_COMMAND[p_class] +
                            method.split("iedb_")[1] + " " + ",".join(hlas) +
                            " " + mutations_path + f + " >> " + output_file)
    if parallel:
        p_file = "temp_par_cmd.txt"
        with open(p_file, "w") as p_out:
            p_out.write("\n".join(cmds))
        cmd = "parallel --no-notice -j " + str(parallel) + " <" + p_file
        try:
            system(cmd)
        except Exception as e:
            raise BindingPredictionException(str(e))
        remove(p_file)
    else:
        for cmd in cmds:
            try:
                system(cmd)
            except Exception as e:
                raise BindingPredictionException(str(e))
    util.print_status(util.TASK_SUCCESS)
def filter(opts):
    util.print_task(util.TASK_FILTER_BINDING)
    method = opts[util.METHOD_OPTION]
    input_path = opts[util.OUTPUT_OPTION]
    p_class = opts[util.CLASS_OPTION]
    mutations_path = input_path + util.MUTATION_DIRECTORY
    nf_predictions_path = (input_path + "/c" + str(p_class) + "_" +
                           util.PREDICTION_NOT_FILTERED_DIRECTORY)
    raw_predictions_path = (input_path + "/c" + str(p_class) + "_" +
                            util.PREDICTION_RAW_DIRECTORY)
    directory = path.dirname(nf_predictions_path)
    if not path.exists(directory):
        makedirs(directory)
    id_to_gene = defaultdict(dict)
    files = [f for f in listdir(raw_predictions_path)
             if path.isfile(path.join(raw_predictions_path, f))]
    if not files:
        raise NoBindingPredictionFileWasFoundException
    if p_class == 1:
        nf_header = "\t".join([
            "sample", "gene", "variant", "genotype", "hgvs_c", "hgvs_p",
            "nm", "np", "annotation", "allele", "len", "ref_peptide",
            "ref_ic50", "alt_peptide", "alt_ic50", "dai", "classification"])
    else:
        nf_header = "\t".join([
            "sample", "gene", "variant", "genotype", "hgvs_c", "hgvs_p",
            "nm", "np", "annotation", "allele", "ref_peptide", "ref_ic50",
            "alt_peptide", "alt_ic50", "dai", "classification"])
    wrote_results = False
    for f in files:
        header_to_id = {}
        not_filtered_results = set()
        if f.startswith("."):
            continue
        # index every prediction line by its sequence id and peptide key
        with open(raw_predictions_path + f, "r") as st:
            st.readline()  # skip the prediction header
            for line in st:
                if method == "mhcflurry":
                    if line.startswith("source_sequence_name"):
                        continue
                    lsplit = line.rstrip().split(",")
                    p_key = lsplit[0].rstrip().split("|")[-1]
                else:
                    if line.startswith("allele"):
                        continue
                    lsplit = line.rstrip().split("\t")
                    p_key = lsplit[1]
                if len(lsplit) < 5:
                    continue
                if method == "mhcflurry":
                    s_key = "|".join([lsplit[3], lsplit[1], lsplit[-1]])
                elif p_class == 1:
                    s_key = "|".join([lsplit[0], lsplit[2], lsplit[4]])
                else:
                    s_key = "|".join([lsplit[0], lsplit[2]])
                id_to_gene[p_key][s_key] = line.rstrip()
        # collect the sequence ids that belong to each mutation header
        with open(mutations_path + f, "r") as st:
            for line in st:
                if line.startswith(">"):
                    k = "|".join(line.rstrip().split("|")[0:-1])
                    header_to_id.setdefault(k, []).append(
                        int(line.rstrip().split("|")[-1]))
        for key in header_to_id:
            (sample, nm, np, annotation, gene,
             hgvs_c, hgvs_p, variant, genotype) = key.split("|")
            if len(header_to_id[key]) % 2 == 0:
                # reference/mutant ids come in pairs: compare each pair
                for i in range(0, len(header_to_id[key]), 2):
                    hti = header_to_id[key][i:i + 2]
                    ref = id_to_gene[str(min(hti))]
                    alt = id_to_gene[str(max(hti))]
                    for prediction in alt.keys():
                        if prediction not in ref:
                            continue
                        if method == "mhcflurry":
                            ref_prediction = ref[prediction].split(",")
                            alt_prediction = alt[prediction].split(",")
                            ic50 = 4
                        else:
                            ref_prediction = ref[prediction].split("\t")
                            alt_prediction = alt[prediction].split("\t")
                            ic50 = 6
                        # keep only pairs where the reference peptide is a non-binder
                        if float(ref_prediction[ic50]) < 500:
                            continue
                        # skip pairs whose peptides are identical
                        if p_class == 1:
                            if ref_prediction[5] == alt_prediction[5]:
                                continue
                        else:
                            if ref_prediction[4] == alt_prediction[4]:
                                continue
                        alt_ic50 = float(alt_prediction[ic50])
                        if alt_ic50 < 50:
                            rank = "STRONG BINDER"
                        elif alt_ic50 < 250:
                            rank = "INTERMEDIATE BINDER"
                        elif alt_ic50 < 500:
                            rank = "WEAK BINDER"
                        else:
                            rank = "NON BINDER"
                        dai = float(ref_prediction[ic50]) - alt_ic50
                        if method == "mhcflurry":
                            merged_report = "\t".join([
                                sample[1:], gene, variant, genotype, hgvs_c,
                                hgvs_p, nm, np, annotation, alt_prediction[3],
                                alt_prediction[-1], ref_prediction[2]])
                            merged_report += "\t" + "\t".join([
                                ref_prediction[ic50], alt_prediction[2],
                                alt_prediction[ic50], str(dai), rank])
                        elif p_class == 1:
                            merged_report = "\t".join([
                                sample[1:], gene, variant, genotype, hgvs_c,
                                hgvs_p, nm, np, annotation, alt_prediction[0],
                                alt_prediction[4], ref_prediction[5]])
                            merged_report += "\t" + "\t".join([
                                ref_prediction[ic50], alt_prediction[5],
                                alt_prediction[ic50], str(dai), rank])
                        else:
                            merged_report = "\t".join([
                                sample[1:], gene, variant, genotype, hgvs_c,
                                hgvs_p, nm, np, annotation, alt_prediction[0],
                                ref_prediction[4]])
                            merged_report += "\t" + "\t".join([
                                ref_prediction[ic50], alt_prediction[4],
                                alt_prediction[ic50], str(dai), rank])
                        not_filtered_results.add(merged_report)
            else:
                # odd number of ids: no reference sequence, report mutant only
                alt_id = max(header_to_id[key])
                alt = id_to_gene[str(alt_id)]
                for prediction in alt.keys():
                    if method == "mhcflurry":
                        alt_prediction = alt[prediction].split(",")
                        ic50 = 4
                        alt_peptide = alt_prediction[2]
                        merged_report = "\t".join([
                            sample[1:], gene, variant, genotype, hgvs_c,
                            hgvs_p, nm, np, annotation, alt_prediction[3],
                            alt_prediction[-1], "NA"])
                    else:
                        alt_prediction = alt[prediction].split("\t")
                        ic50 = 6
                        alt_peptide = alt_prediction[5]
                        merged_report = "\t".join([
                            sample[1:], gene, variant, genotype, hgvs_c,
                            hgvs_p, nm, np, annotation, alt_prediction[0],
                            alt_prediction[4], "NA"])
                    alt_ic50 = float(alt_prediction[ic50])
                    if alt_ic50 < 50:
                        rank = "STRONG BINDER"
                    elif alt_ic50 < 250:
                        rank = "INTERMEDIATE BINDER"
                    elif alt_ic50 < 500:
                        rank = "WEAK BINDER"
                    else:
                        rank = "NON BINDER"
                    merged_report += "\t" + "\t".join([
                        "NA", alt_peptide, alt_prediction[ic50], "NA", rank])
                    not_filtered_results.add(merged_report)
        if not_filtered_results:
            with open(nf_predictions_path + f, "w") as out:
                out.write(nf_header + "\n")
                out.write("\n".join(not_filtered_results))
            wrote_results = True
    system("rm -rf " + raw_predictions_path)
    util.print_status(util.TASK_SUCCESS)
    return wrote_results
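# The IC50 thresholds above (standard MHC-binding cutoffs, in nM) are applied
# twice; a minimal helper sketch capturing the same classification rule
# (hypothetical, not part of the original module):
def classify_binder(ic50_nm):
    """Map a predicted IC50 (nM) to the binder classes used above."""
    if ic50_nm < 50:
        return "STRONG BINDER"
    elif ic50_nm < 250:
        return "INTERMEDIATE BINDER"
    elif ic50_nm < 500:
        return "WEAK BINDER"
    return "NON BINDER"

assert classify_binder(42.0) == "STRONG BINDER"
assert classify_binder(300.0) == "WEAK BINDER"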
#==========[ Step 4: draw image ]==========
cv2.namedWindow('DISPLAY')
cv.SetMouseCallback('DISPLAY', on_mouse, param=harris_corners)

#==========[ Step 4: have user mark keypoints ]==========
while True:
    disp_image = cv2.drawKeypoints(image, harris_corners, color=(0, 0, 255))
    disp_image = cv2.drawKeypoints(disp_image, corner_keypoints, color=(255, 0, 0))
    cv2.imshow('DISPLAY', disp_image)
    key = cv2.waitKey(30)
    if key == 27:  # ESC ends the marking loop
        break

#==========[ Step 5: get descriptors for each corner point ]==========
print_status('MarkImage', 'getting SIFT descriptors for clicked corners')
corner_sift_desc = CVAnalysis.get_sift_descriptors(image, corner_keypoints)

#==========[ Step 6: construct BoardImage ]==========
print_status('MarkImage', 'constructing BoardImage object')
board = Board(image=image,
              name=image_name,
              board_points=corner_board_points,
              image_points=corner_image_points,
              sift_desc=corner_sift_desc)
def __init__(self, msg):
    util.print_status(util.TASK_ERROR)
    super(MException, self).__init__(msg)
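# Usage sketch (an assumption): the pipeline-specific exceptions raised
# elsewhere in this module, e.g. AllelePredictionException and
# BindingPredictionException, are presumed to subclass MException, so
# constructing one prints the error status before the exception propagates.
class AllelePredictionException(MException):
    pass

try:
    raise AllelePredictionException("OptiType run failed")
except MException as e:
    print("caught:", e)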
def execute(opts):
    fastq_input = opts[util.FASTQ_INPUT_OPTION]
    output = opts[util.OUTPUT_OPTION] + util.GENE_EXPRESSION
    index = util.HUMAN_TRANSCRIPTS_INDEX
    directory = os.path.dirname(output)
    if not os.path.exists(directory):
        os.makedirs(directory)
    util.print_task(util.TASK_GENE_EXPRESSION)
    # group fastq files by the sample prefix before the first "_"
    ids = defaultdict(list)
    for f in fastq_input:
        sample = f.split("/")[-1].split("_")[0]
        ids[sample].append(f)
    for sample in ids.keys():
        if ids[sample][0].find("read") >= 0:
            end = " --single -l " + str(200) + " -s " + str(20) + " "
        else:
            end = " "
        files = " ".join(ids[sample])
        cmd = "kallisto quant -i " + index + " -o " + output + end + files
        try:
            system(cmd)
        except Exception as e:
            raise QuantifyingExpressionException(str(e))
        # map transcript ids to gene symbols and keep transcripts with TPM > 1
        mg = mygene.MyGeneInfo()
        symbols = pd.read_csv(output + "abundance.tsv", sep='\t', header=0,
                              usecols=['target_id', 'tpm'])
        symbols['target_id'] = symbols['target_id'].str.split('.').str[0]
        sy = mg.querymany(symbols['target_id'], scopes='all', fields='symbol',
                          species='human', verbose=False, as_dataframe=True)
        df = pd.merge(symbols, sy, left_on="target_id", right_on="query").drop(
            columns=['_id', '_score', 'notfound']).drop_duplicates()
        df2 = df.groupby(['symbol', 'target_id'])['tpm'].sum().reset_index(
            name='tpm').drop_duplicates()
        df2.columns = ['symbol', 'transcript', 'tpm']
        df2['tpm'] = pd.to_numeric(df2['tpm'])
        df2[df2['tpm'] > 1].to_csv(output + sample + ".tsv", sep='\t', index=False)
        os.remove(output + "abundance.tsv")
        os.remove(output + "abundance.h5")
        os.remove(output + "run_info.json")
    util.print_status(util.TASK_SUCCESS)
def extract(vcf_info):
    util.print_task(util.TASK_EXTRACT_VCF_INFO)
    mg = mygene.MyGeneInfo()
    out_file = vcf_info + ".ExtractedInfo.txt"
    out_string = []
    if os.path.isdir(vcf_info):
        files = [path.join(vcf_info, f) for f in listdir(vcf_info)
                 if path.isfile(path.join(vcf_info, f))]
    else:
        files = [vcf_info]
    for vcf in files:
        if not vcf.endswith(".annotated"):
            continue
        with open(vcf, "r") as f:
            samples = {}
            line_count = 1
            for line in f:
                try:
                    if line.startswith("##"):
                        line_count += 1
                        continue
                    linesplit = line.rstrip().split("\t")
                    if line.startswith("#"):
                        # header line: map column index -> sample name
                        for i in range(9, len(linesplit)):
                            samples[i] = linesplit[i]
                    else:
                        try:
                            infos = linesplit[7].split(",")
                            for i in infos:
                                mut = i.split("ANN=")[-1]
                                infosplit = mut.split("|")
                                for key in samples.keys():
                                    if linesplit[key].split(":")[0].find("1") >= 0:
                                        out_string.append("\t".join([
                                            samples[key], infosplit[1],
                                            infosplit[3], infosplit[6],
                                            infosplit[9], infosplit[10],
                                            linesplit[2],
                                            linesplit[key].split(":")[0]]))
                        except Exception:
                            continue
                except Exception as e:
                    util.print_status(util.TASK_ERROR)
                    msg = (util.REPORT + str(e) + "\n\tline: " + str(line_count) +
                           " | \"" + line.rstrip() + "\"\n")
                    raise VCFWrongFormat(msg)
                line_count += 1
    sorted_out_string = sorted(out_string)
    df = pd.DataFrame([sub.split("\t") for sub in sorted_out_string],
                      columns=["Sample", "Annotation", "Gene", "Transcript",
                               "HGVS.c", "HGVS.p", "Variant",
                               "Genotype"]).drop_duplicates()
    df = df[df['Transcript'].str.contains('NM')]
    df['Transcript'] = df['Transcript'].str.split('.').str[0]
    df.to_csv(out_file, sep='\t', index=False)
    util.print_status(util.TASK_SUCCESS)
    return out_file
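# Field layout assumed above (a sketch based on the SnpEff "ANN=" annotation
# format): each comma-separated entry is pipe-delimited, with Annotation at
# index 1, Gene_Name at 3, Feature_ID (transcript) at 6, and HGVS.c / HGVS.p
# at 9 / 10. The entry below is a hypothetical example.
ann = ("A|missense_variant|MODERATE|BRCA1|672|transcript|NM_007294.3|"
       "protein_coding|10/23|c.181T>G|p.Cys61Gly")
infosplit = ann.split("|")
print(infosplit[1], infosplit[3], infosplit[6], infosplit[9], infosplit[10])
# -> missense_variant BRCA1 NM_007294.3 c.181T>G p.Cys61Gly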
#=====[ our modules ]=====
from BoardImage import BoardImage
from CVAnalyzer import CVAnalyzer
from Board import Board
from util import print_welcome, print_message, print_status

#=====[ globals ]=====
board_image_dir = '../data/marked'

if __name__ == "__main__":

    print_welcome()

    #==========[ Step 1: get board_image ]==========
    print_status("Main", "loading board image")
    bi_filename = os.path.join(board_image_dir, 'micah1.bi')
    # bi_filename = os.path.join(board_image_dir, 'above.bi')
    board_image = BoardImage(filename=bi_filename)

    #==========[ Step 2: construct cv_analyzer, get BIH ]==========
    print_status("Main", "creating cv_analyzer")
    cv_analyzer = CVAnalyzer()
    print_status("Main", "finding BIH (board-image homography)")
    BIH = cv_analyzer.find_board_image_homography(board_image)

    #==========[ Step 3: construct the board ]==========
    print_status("Main", "constructing the board")
    board = Board(BIH)

    #==========[ Step 4: draw squares on image ]==========