Example #1
def handle(argv):

    util.print_task(util.TASK_ANALYZE_PARAMETERS)

    absolute_path = util.ABSOLUTE_PATH

    functions = argv.getlist("function[]")
    input_type = argv.get("type")  # avoid shadowing the builtin `type`

    if input_type:
        options = {util.ABSOLUTE_PATH_OPTION: absolute_path}
        options[util.TYPE_OPTION] = input_type

        if input_type == "vcf":
            options[util.INPUT_OPTION] = input_validation(
                argv.get("input"), input_type)
        else:
            options[util.INPUT_OPTION] = input_validation(
                argv.getlist("input[]"), input_type)

        options[util.OUTPUT_OPTION] = output_validation(argv.get("output"))

    elif functions:
        options = additional_processing_handler(absolute_path, argv)
        options[util.ADD_PROCESSING_OPTION] = functions
    else:
        options = binding_prediction_handler(absolute_path, argv)

    util.print_status(util.TASK_SUCCESS)

    return options
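
# Usage sketch (not part of the original): handle() is exercised with a
# werkzeug-style MultiDict, which is only an assumption inferred from the
# get()/getlist() calls above.
from werkzeug.datastructures import MultiDict

argv = MultiDict([
    ("type", "vcf"),
    ("input", "/data/sample.vcf"),
    ("output", "/data/out"),
])
options = handle(argv)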
Example #2
    def get_stored_paths(self):
        cwd = util.replace_home_with_tilde(util.get_cwd())
        cwd = path_strip(cwd)
        oldpwd = util.replace_home_with_tilde(os.environ.get("OLDPWD", cwd))
        oldpwd = path_strip(oldpwd)
        paths = []

        with open(self.config["history_file"]) as afile:
            entries = afile.read().split("\n")

        check_existence = self.config["check_directory_existence"]
        # this may take a while - print something
        # there may be network paths or meta info might be not in the system cache
        if check_existence:
            util.print_status("Checking the existence of directories...", truncate=True)

        for line in entries:
            path = line.strip()
            if not path:
                continue  # skip empty entries from the trailing newline
            if path in [cwd, oldpwd]:
                continue
            if check_existence:
                exists = os.path.exists(expanduser(path))
            else:
                exists = True
            paths.append((path_strip(path), exists))

        if check_existence:
            util.remove_status()

        # cwd always first, prev path in the current shell is always second if available
        paths.insert(0, (cwd, os.path.exists(expanduser(cwd))))
        if cwd != oldpwd:
            paths.insert(1, (oldpwd, os.path.exists(expanduser(oldpwd))))
        return paths
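
# path_strip is referenced above but not shown; a minimal sketch under the
# assumption that it only trims a trailing separator.
import os

def path_strip(path):
    # hypothetical helper: drop a trailing separator, but keep a bare "/"
    stripped = path.rstrip(os.sep)
    return stripped if stripped else os.sep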
Example #3
	def extract_users (self, calendar_df_iterator):
		"""
			given an iterator over calendar dataframes,
			this constructs and returns a dataframe 
			containing all users
		"""
		print_header ("EXTRACTING USERS")
		#==========[ ITERATE OVER ALL DFS	]==========
		for cdf in calendar_df_iterator ():
			print_status ("Extract users", "next df")

			#=====[ Step 1: sort by user	]=====
			print_inner_status ("extract_users", "sorting by user id")
			cdf = cdf.sort_values ('user')

			#=====[ Step 2: init user representations	]=====
			print_inner_status ("extract_users", "initializing user representations")
			unique_uids = cdf['user'].unique ()
			for uid in unique_uids:
				if uid not in self.user_representations:
					self.user_representations[uid] = self.init_user_representation(uid)

			#=====[ Step 3: update the user representations	]=====
			print_inner_status ("extract_users", "updating user representations")			
			cdf.apply (self.update_user_representation, axis = 1)

		#=====[ Step 4: convert to df, delete irrelevant stuff	]=====
		print_inner_status ("extract_users", "converting to dataframe")		
		self.users_df = pd.DataFrame(self.user_representations.values())
		del self.user_representations
		return self.users_df
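
# Toy demonstration of the per-user pass above on an invented DataFrame;
# only the 'user' column name comes from the snippet.
import pandas as pd

cdf = pd.DataFrame({"user": [2, 1, 1], "event": ["c", "a", "b"]})
cdf = cdf.sort_values("user")       # modern spelling of the removed DataFrame.sort()
unique_uids = cdf["user"].unique()  # one id per user
print(list(unique_uids))            # [1, 2]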
Example #4
	def train_semantic_analysis (self):
		"""
			PUBLIC: train_semantic_analysis
			-------------------------------
			finds parameters for self.semantic_analysis
		"""
		#=====[ Step 1: get the corpus	]=====
		print_status ("train_semantic_analysis", "getting corpus/dictionary")
		corpus, dictionary = self.get_corpus_dictionary ()

		#=====[ Step 2: train ]=====
		print_status ("train_semantic_analysis", "training semantic analysis")
		self.semantic_analysis.train (corpus, dictionary)
Example #5
def execute(opts):

    util.print_task(util.TASK_ALLELE_TYPING)

    input = opts[util.FASTQ_INPUT_OPTION]
    output = opts[util.OUTPUT_OPTION] + util.ALLELE_DIRECTORY

    directory = os.path.dirname(output)
    if not os.path.exists(directory):
        os.makedirs(directory)

    fls = defaultdict(list)

    for f in input:
        sample = f.split("/")[-1].split("_")[0]
        fls[sample].append(f)

    for sample in fls.keys():
        cmd = "OptiTypePipeline.py -i " + " ".join(
            fls[sample]) + " -r -p " + sample + " -o " + output
        try:
            time.sleep(5)
            system(cmd)
            time.sleep(5)
        except Exception as e:
            raise AllelePredictionException(str(e))

        # collect the six HLA calls (A/B/C, two each) for this sample
        alleles = set()
        with open(output + sample + "_result.tsv", "r") as r:
            r.readline()  # skip header
            for line in r:
                fields = line.rstrip().split("\t")
                for allele in fields[1:7]:
                    alleles.add(
                        allele.replace("A*", "HLA-A*").replace(
                            "B*", "HLA-B*").replace("C*", "HLA-C*"))

        with open(output + sample + ".tsv", "w") as out:
            out.write("allele\n")
            out.write("\n".join(alleles))

        system("rm -f " + output + sample + "_result.tsv")

    for p, d, files in os.walk(output):
        for y in files:
            if not y.endswith(".tsv"):
                system("rm -f " + os.path.join(p, y))

    util.print_status(util.TASK_SUCCESS)
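
# Quick check (invented filenames) of the grouping convention above: the
# sample id is the basename up to the first underscore.
from collections import defaultdict

files = ["/data/S1_R1.fastq", "/data/S1_R2.fastq", "/data/S2_R1.fastq"]
fls = defaultdict(list)
for f in files:
    fls[f.split("/")[-1].split("_")[0]].append(f)
print(dict(fls))  # {'S1': ['/data/S1_R1.fastq', '/data/S1_R2.fastq'], 'S2': [...]}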
Example #6
def pull_activities (pull_start, chunk_size):
	"""
		Function: pull_activities
		-------------------------
		pulls from index 'pull_start' for chunk_size entries,
		returns them in a well-formatted dataframe
	"""
	print_status ("Pulling activities", "%d to %d" % (pull_start, pull_start + chunk_size))

	#=====[ Step 1: get the data	]=====
	params = {'from':pull_start, 'to':pull_start + chunk_size, 'size':500}
	response = requests.get(elasticsearch_activities_endpoint, auth=(elasticsearch_username, elasticsearch_password), params=params)

	#=====[ Step 2: preprocess it	]=====
	data_df = pre.preprocess_a (response.json ())
	return data_df
Example #7
def mutate(vcf_info, opts):

    abs_path = opts[util.ABSOLUTE_PATH_OPTION]
    output = opts[util.OUTPUT_OPTION]

    log_not_found = []
    ################################# RefSeq_human_full.fasta ###################################

    util.print_task(util.TASK_LOAD_PROTEIN_FILE)

    refseq_human = read_protein_file(abs_path)

    util.print_status(util.TASK_SUCCESS)

    #############################################################################################

    ############################# Transcripts_refseq.fasta reading ##############################

    util.print_task(util.TASK_LOAD_TRANSCRIPT_FILE)

    refseq_transc, nm_np_conversor = read_transcript_file(abs_path)

    util.print_status(util.TASK_SUCCESS)

    #############################################################################################

    ################################# vcf info file processing ##################################

    mutations = defaultdict(list)
    samples = set()
    curr_sample = 1

    util.print_task(util.TASK_PROCESS_MUTATION)

    with open(vcf_info, "r") as f:

        f.readline()

        for line in f:

            try:
                mutation = Mutation(line, nm_np_conversor, refseq_transc, refseq_human)
            except KeyError:
                log_not_found.append(line.rstrip())
                continue
            
            samples.add(mutation.sample)

            if mutation.mut_protein_sequence:
                if len(samples) == curr_sample:
                    mutations[mutation.transcript].append(mutation)
                else:
                    generate_report.mutation(mutations[next(iter(mutations))][0].sample, mutations, output)
                    mutations = defaultdict(list)
                    mutations[mutation.transcript].append(mutation)
                    curr_sample += 1
                
    if mutations:
        generate_report.mutation(mutations[next(iter(mutations))][0].sample, mutations, output)

    util.print_status(util.TASK_SUCCESS)
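
# Toy illustration (invented records) of the flush-on-new-sample pattern
# used in mutate(): batches accumulate per transcript and are emitted
# whenever the sample id changes, plus once at the end.
from collections import defaultdict

records = [("s1", "NM_1"), ("s1", "NM_2"), ("s2", "NM_1")]
batch, current = defaultdict(list), None
for sample, transcript in records:
    if current is not None and sample != current:
        print("flush", current, dict(batch))
        batch = defaultdict(list)
    batch[transcript].append(sample)
    current = sample
if batch:
    print("flush", current, dict(batch))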
Example #8
	def load (self):
		"""
			PUBLIC: load 
			------------
			loads in all parameters 
		"""
		#=====[ Step 1: load in semantic analysis	]=====
		print_status ("Initialization", "Loading ML parameters (Begin)")
		self.semantic_analysis.load ()
		print_status ("Initialization", "Loading ML parameters (End)")		

		#=====[ Step 2: transfer over models to inference	]=====
		print_status ("Initialization", "Constructing Inference instance (Begin)")
		self.inference = Inference (self.semantic_analysis.lda_model, self.semantic_analysis.lda_model_topics)
		print_status ("Initialization", "Constructing Inference instance (End)")
Example #9
	def get_corpus_dictionary (self):
		"""
			PRIVATE: get_corpus_dictionary
			------------------------------
			Assembles a gensim corpus and dictionary from activities_df,
			where each text is name || words.
		"""
		#=====[ Step 1: iterate through all activity dataframes	]=====
		print_status ("get_corpus", "assembling texts")
		texts = []
		for df in self.storage_delegate.iter_activity_dfs ():
			print_inner_status ("assembling texts", "next df")
			texts += list(df.apply(self.extract_text, axis=1))

		#=====[ Step 2: get dictionary	]=====
		print_status ("get_corpus", "assembling dictionary")
		dictionary = gensim.corpora.Dictionary(texts)

		#=====[ Step 3: get corpus	]=====
		print_status ("get_corpus", "assembling corpus")		
		corpus = [dictionary.doc2bow (text) for text in texts]

		return corpus, dictionary
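
# Self-contained illustration of the corpus/dictionary assembly above
# (requires gensim; the texts are invented).
import gensim

texts = [["hike", "trail"], ["trail", "run"], ["hike"]]
dictionary = gensim.corpora.Dictionary(texts)          # token -> integer id
corpus = [dictionary.doc2bow(text) for text in texts]  # bag-of-words vectors
print(corpus[0])  # e.g. [(0, 1), (1, 1)]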
Example #10
	cv2.namedWindow ('DISPLAY')
	cv2.setMouseCallback ('DISPLAY', on_mouse, harris_corners)


	#==========[ Step 4: have user mark keypoints ]==========
	while True:

		disp_image = cv2.drawKeypoints (image, harris_corners, None, color=(0, 0, 255))
		disp_image = cv2.drawKeypoints (disp_image, corner_keypoints, None, color=(255, 0, 0))

		cv2.imshow ('DISPLAY', disp_image)
		key = cv2.waitKey(30)
		if key == 27:
			break

	
	#==========[ Step 5: get descriptors for each corner point	]==========
	print_status ('MarkImage', 'getting SIFT descriptors for clicked corners')
	desc = CVAnalysis.get_sift_descriptors (image, corner_keypoints)
	corner_sift_desc = desc


	#==========[ Step 6: construct BoardImage	]==========
	print_status ('MarkImage', 'constructing BoardImage object')	
	board = Board (
		image=image,
		name=image_name,
		board_points=corner_board_points,
		image_points=corner_image_points,
		sift_desc=corner_sift_desc,
	)
Example #11
	def preprocess_ce(self, ce):
		"""
			PUBLIC: preprocess_ce
			---------------------
			given an object representing calendar events (either json or
			pandas dataframe), this will return a correctly-formatted version
		"""
		#=====[ Step 1: ce -> dataframe representation	]=====
		df = self.get_dataframe_rep (ce)

		#=====[ Step 2: apply formatting operations	]=====
		print_status ("preprocess_ce", "dropping unnecessary columns")
		df = self.retain_columns (df, ce_retain_cols)

		print_status ("preprocess_ce", "reformatting location")
		df = self.reformat_location (df)

		print_status ("preprocess_ce", "filtering by location")
		df = self.filter_location (df)

		print_status ("preprocess_ce", "reformatting dates")
		df = self.reformat_date (df)

		print_status ("preprocess_ce", "reformatting name")
		df = self.reformat_name (df)

		print_status ("preprocess_ce", "reformatting description")
		df = self.reformat_description (df)

		return df
Example #12
def execute(opts):

    util.print_task(util.TASK_PREDICT_BINDING)

    abs_path = opts[util.ABSOLUTE_PATH_OPTION]
    pep_len = opts[util.LENGTH_OPTION]
    method = opts[util.METHOD_OPTION]
    hlas = opts[util.ALLELE_OPTION]
    parallel = opts[util.PARALLEL_OPTION]
    p_class = opts[util.CLASS_OPTION]
    input_path = opts[util.OUTPUT_OPTION]

    mutations_path = input_path + util.MUTATION_DIRECTORY
    raw_predictions_path = input_path + "/c" + str(
        p_class) + "_" + util.PREDICTION_RAW_DIRECTORY

    directory = path.dirname(raw_predictions_path)
    if not path.exists(directory):
        makedirs(directory)

    files = [
        f for f in listdir(mutations_path)
        if path.isfile(path.join(mutations_path, f))
    ]

    if not files:
        raise NoMutatedFileWasFoundException

    cmds = []

    for f in files:

        if f.startswith("."):
            continue

        if method == "mhcflurry":
            with open(mutations_path + f, "r") as st:
                line_fasta = []
                for line in st:
                    if line.startswith(">"):
                        text = line.replace(".", "_")
                    else:
                        text = line
                    line_fasta.append(text)
                pfasta_file = mutations_path + f
                pfasta_out = open(pfasta_file, "w")
                pfasta_out.write("".join(line_fasta))
                pfasta_out.close()

        output_file = raw_predictions_path + f

        for i in pep_len:
            pl = [str(i)] * len(hlas)
            if p_class == 1:
                if method == "mhcflurry":
                    cmds.append(
                        "mhctools --mhc-predictor mhcflurry --input-fasta-file "
                        + mutations_path + f +
                        " --extract-subsequences --mhc-alleles " +
                        ",".join(hlas) + " --mhc-peptide-lengths " +
                        ",".join(pl) + " --output-csv " + output_file)
                else:
                    cmds.append("python " + util.PREDICTION_CLASS_COMMAND[p_class] + method.split("iedb_")[1] + " \"" + ",".join(hlas) + \
                                "\" " + ",".join(pl) + " " + mutations_path + f + " >> " + output_file)
            else:
                cmds.append("python " + util.PREDICTION_CLASS_COMMAND[p_class] + method.split("iedb_")[1] + " " + ",".join(hlas) + \
                            " " + mutations_path + f + " >> " + output_file)

    if parallel:
        p_file = "temp_par_cmd.txt"
        with open(p_file, "w") as p_out:
            p_out.write("\n".join(cmds))

        cmd = "parallel --no-notice -j " + str(parallel) + " <" + p_file

        try:
            system(cmd)
        except Exception as e:
            raise BindingPredictionException(str(e))
        remove(p_file)

    else:
        for cmd in cmds:
            try:
                system(cmd)
            except Exception as e:
                raise BindingPredictionException(str(e))
    util.print_status(util.TASK_SUCCESS)
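
# The snippet shells out via system() / GNU parallel; a sketch of the
# serial path with subprocess instead, which surfaces non-zero exit codes
# (os.system never raises on command failure, so the try/except above is
# largely inert).
import subprocess

def run_commands(cmds):
    for cmd in cmds:
        # shell=True keeps the snippet's string commands; check=True raises
        # CalledProcessError on failure.
        subprocess.run(cmd, shell=True, check=True)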
Example #13
def filter(opts):

    util.print_task(util.TASK_FILTER_BINDING)

    method = opts[util.METHOD_OPTION]
    input_path = opts[util.OUTPUT_OPTION]
    p_class = opts[util.CLASS_OPTION]

    mutations_path = input_path + util.MUTATION_DIRECTORY
    nf_predictions_path = input_path + "/c" + str(
        p_class) + "_" + util.PREDICTION_NOT_FILTERED_DIRECTORY
    raw_predictions_path = input_path + "/c" + str(
        p_class) + "_" + util.PREDICTION_RAW_DIRECTORY

    directory = path.dirname(nf_predictions_path)
    if not path.exists(directory):
        makedirs(directory)

    id_to_gene = defaultdict(dict)
    files = [
        f for f in listdir(raw_predictions_path)
        if path.isfile(path.join(raw_predictions_path, f))
    ]

    if not files:
        raise NoBindingPredictionFileWasFoundException

    any_results = False

    for f in files:
        header_to_id = {}
        filtered_results = set()
        not_filtered_results = set()

        if f.startswith("."):
            continue

        with open(raw_predictions_path + f, "r") as st:
            st.readline()  # skip the header row
            for line in st:
                if method == "mhcflurry":
                    if line.startswith("source_sequence_name"):
                        continue
                    lsplit = line.rstrip().split(",")
                    lsplit2 = lsplit[0].rstrip().split("|")
                    p_key = lsplit2[-1]
                else:
                    if line.startswith("allele"):
                        continue
                    lsplit = line.rstrip().split("\t")
                    p_key = lsplit[1]

                if len(lsplit) < 5:
                    continue

                if method == "mhcflurry":
                    s_key = "|".join([lsplit[3], lsplit[1], lsplit[-1]])
                elif p_class == 1:
                    s_key = "|".join([lsplit[0], lsplit[2], lsplit[4]])
                else:
                    s_key = "|".join([lsplit[0], lsplit[2]])

                id_to_gene[p_key][s_key] = line.rstrip()

        with open(mutations_path + f, "r") as st:
            for line in st:
                if line.startswith(">"):
                    k = "|".join(line.rstrip().split("|")[0:-1])
                    header_to_id.setdefault(k, []).append(
                        int(line.rstrip().split("|")[-1]))

        for key in header_to_id.keys():
            sample, nm, np, annotation, gene, hgvs_c, hgvs_p, variant, genotype = key.split(
                "|")

            if len(header_to_id[key]) % 2 == 0:
                for i in range(0, len(header_to_id[key]), 2):

                    hti = header_to_id[key][i:i + 2]
                    ref_id = min(hti)
                    alt_id = max(hti)

                    ref = id_to_gene[str(ref_id)]
                    alt = id_to_gene[str(alt_id)]

                    for prediction in alt.keys():
                        # only peptides predicted for both ref and alt can be compared
                        if prediction not in ref:
                            continue

                        if method == "mhcflurry":
                            ref_prediction = ref[prediction].split(",")
                            alt_prediction = alt[prediction].split(",")
                            ic50 = 4
                        else:
                            ref_prediction = ref[prediction].split("\t")
                            alt_prediction = alt[prediction].split("\t")
                            ic50 = 6

                        if float(ref_prediction[ic50]) < 500:
                            continue

                        if p_class == 1:
                            if ref_prediction[5] == alt_prediction[5]:
                                continue
                        else:
                            if ref_prediction[4] == alt_prediction[4]:
                                continue

                        if float(alt_prediction[ic50]) < 50:
                            rank = "STRONG BINDER"
                        elif float(alt_prediction[ic50]) < 250:
                            rank = "INTERMEDIATE BINDER"
                        elif float(alt_prediction[ic50]) < 500:
                            rank = "WEAK BINDER"
                        else:
                            rank = "NON BINDER"

                        dai = float(ref_prediction[ic50]) - float(
                            alt_prediction[ic50])

                        if method == "mhcflurry":
                            merged_report = "\t".join([
                                sample[1:], gene, variant, genotype, hgvs_c,
                                hgvs_p, nm, np, annotation, alt_prediction[3],
                                alt_prediction[-1], ref_prediction[2]
                            ])
                            merged_report += "\t" + "\t".join([
                                ref_prediction[ic50], alt_prediction[2],
                                alt_prediction[ic50],
                                str(dai), rank
                            ])
                        elif p_class == 1:
                            merged_report = "\t".join([
                                sample[1:], gene, variant, genotype, hgvs_c,
                                hgvs_p, nm, np, annotation, alt_prediction[0],
                                alt_prediction[4], ref_prediction[5]
                            ])
                            merged_report += "\t" + "\t".join([
                                ref_prediction[ic50], alt_prediction[5],
                                alt_prediction[ic50],
                                str(dai), rank
                            ])
                        else:
                            merged_report = "\t".join([
                                sample[1:], gene, variant, genotype, hgvs_c,
                                hgvs_p, nm, np, annotation, alt_prediction[0],
                                ref_prediction[4]
                            ])
                            merged_report += "\t" + "\t".join([
                                ref_prediction[ic50], alt_prediction[4],
                                alt_prediction[ic50],
                                str(dai), rank
                            ])

                        not_filtered_results.add(merged_report)

            else:
                alt_id = max(header_to_id[key])
                alt = id_to_gene[str(alt_id)]

                for prediction in alt.keys():

                    if method == "mhcflurry":
                        # mhcflurry raw output is comma-separated; ic50 in column 4
                        alt_prediction = alt[prediction].split(",")
                        ic50 = 4
                        alt_peptide = alt_prediction[2]
                        merged_report = "\t".join([
                            sample[1:], gene, variant, genotype, hgvs_c,
                            hgvs_p, nm, np, annotation, alt_prediction[3],
                            alt_prediction[-1], "NA"
                        ])
                    else:
                        # IEDB raw output is tab-separated; ic50 in column 6
                        alt_prediction = alt[prediction].split("\t")
                        ic50 = 6
                        alt_peptide = alt_prediction[5]
                        merged_report = "\t".join([
                            sample[1:], gene, variant, genotype, hgvs_c,
                            hgvs_p, nm, np, annotation, alt_prediction[0],
                            alt_prediction[4], "NA"
                        ])

                    if float(alt_prediction[ic50]) < 50:
                        rank = "STRONG BINDER"
                    elif float(alt_prediction[ic50]) < 250:
                        rank = "INTERMEDIATE BINDER"
                    elif float(alt_prediction[ic50]) < 500:
                        rank = "WEAK BINDER"
                    else:
                        rank = "NON BINDER"

                    dai = "NA"
                    merged_report += "\t" + "\t".join([
                        "NA", alt_peptide, alt_prediction[ic50],
                        str(dai), rank
                    ])

                    not_filtered_results.add(merged_report)

        if p_class == 1:
            nf_header = "\t".join([
                "sample", "gene", "variant", "genotype", "hgvs_c", "hgvs_p",
                "nm", "np", "annotation", "allele", "len", "ref_peptide",
                "ref_ic50", "alt_peptide", "alt_ic50", "dai", "classification"
            ])
        else:
            nf_header = "\t".join([
                "sample", "gene", "variant", "genotype", "hgvs_c", "hgvs_p",
                "nm", "np", "annotation", "allele", "ref_peptide", "ref_ic50",
                "alt_peptide", "alt_ic50", "dai", "classification"
            ])

        if not_filtered_results:
            any_results = True
            with open(nf_predictions_path + f, "w") as out:
                out.write(nf_header + "\n")
                out.write("\n".join(not_filtered_results))

    system("rm -rf " + raw_predictions_path)

    util.print_status(util.TASK_SUCCESS)

    return any_results
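
# The IC50 binning appears twice in filter(); a small helper capturing the
# same thresholds (50/250/500 nM, taken from the code above).
def classify_binder(ic50):
    if ic50 < 50:
        return "STRONG BINDER"
    if ic50 < 250:
        return "INTERMEDIATE BINDER"
    if ic50 < 500:
        return "WEAK BINDER"
    return "NON BINDER"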
Example #14
    #==========[ Step 4: draw image	]==========
    cv2.namedWindow('DISPLAY')
    cv2.setMouseCallback('DISPLAY', on_mouse, harris_corners)

    #==========[ Step 4: have user mark keypoints ]==========
    while True:

        disp_image = cv2.drawKeypoints(image,
                                       harris_corners,
                                       None,
                                       color=(0, 0, 255))
        disp_image = cv2.drawKeypoints(disp_image,
                                       corner_keypoints,
                                       None,
                                       color=(255, 0, 0))

        cv2.imshow('DISPLAY', disp_image)
        key = cv2.waitKey(30)
        if key == 27:
            break

    #==========[ Step 5: get descriptors for each corner point	]==========
    print_status('MarkImage', 'getting SIFT descriptors for clicked corners')
    desc = CVAnalysis.get_sift_descriptors(image, corner_keypoints)
    corner_sift_desc = desc

    #==========[ Step 6: construct BoardImage	]==========
    print_status('MarkImage', 'constructing BoardImage object')
    board = Board(image=image,
                  name=image_name,
                  board_points=corner_board_points,
                  image_points=corner_image_points,
                  sift_desc=corner_sift_desc)
Example #15
class MException(Exception):
    # enclosing class restored from the super() call below; the
    # Exception base class is an assumption
    def __init__(self, msg):
        util.print_status(util.TASK_ERROR)
        super(MException, self).__init__(msg)
Example #16
def execute(opts):

    input = opts[util.FASTQ_INPUT_OPTION]
    output = opts[util.OUTPUT_OPTION] + util.GENE_EXPRESSION
    index = util.HUMAN_TRANSCRIPTS_INDEX

    directory = os.path.dirname(output)
    if not os.path.exists(directory):
        os.makedirs(directory)

    util.print_task(util.TASK_GENE_EXPRESSION)

    ids = defaultdict(list)

    for f in input:
        sample = f.split("/")[-1].split("_")[0]
        ids[sample].append(f)

    for sample in ids.keys():
        if "read" in ids[sample][0]:
            end = " --single -l 200 -s 20 "
        else:
            end = " "

        files = " ".join(ids[sample])
        cmd = "kallisto quant -i " + index + " -o " + output + end + files

        try:
            system(cmd)
        except Exception as e:
            raise QuantifyingExpressionException(str(e))

        mg = mygene.MyGeneInfo()

        symbols = pd.read_csv(output + "abundance.tsv",
                              sep='\t',
                              header=0,
                              usecols=['target_id', 'tpm'])
        symbols['target_id'] = symbols['target_id'].str.split('.').str[0]

        sy = mg.querymany(symbols['target_id'],
                          scopes='all',
                          fields='symbol',
                          species='human',
                          verbose=False,
                          as_dataframe=True)
        df = pd.merge(symbols, sy, left_on="target_id", right_on="query").drop(
            columns=['_id', '_score', 'notfound']).drop_duplicates()
        df2 = df.groupby([
            'symbol', 'target_id'
        ])['tpm'].sum().reset_index(name='tpm').drop_duplicates()
        df2.columns = ['symbol', 'transcript', 'tpm']

        df2['tpm'] = pd.to_numeric(df2['tpm'])
        df2[df2['tpm'] > 1].to_csv(output + sample + ".tsv",
                                   sep='\t',
                                   index=False)

        os.remove(output + "abundance.tsv")
        os.remove(output + "abundance.h5")
        os.remove(output + "run_info.json")

    util.print_status(util.TASK_SUCCESS)
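
# Minimal, self-contained mygene query in the same shape as the call above
# (requires the mygene package and network access; the transcript id is an
# arbitrary example).
import mygene

mg = mygene.MyGeneInfo()
hits = mg.querymany(["NM_003331"], scopes="all", fields="symbol",
                    species="human", verbose=False, as_dataframe=True)
print(hits["symbol"])  # gene symbol(s) for the queried transcript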
Example #17
def extract(vcf_info):

    util.print_task(util.TASK_EXTRACT_VCF_INFO)

    mg = mygene.MyGeneInfo()

    out_file = vcf_info + ".ExtractedInfo.txt"
    out_string = []

    if os.path.isdir(vcf_info):
        files = [
            path.join(vcf_info, f) for f in listdir(vcf_info)
            if path.isfile(path.join(vcf_info, f))
        ]
    else:
        files = [vcf_info]

    for vcf in files:
        if vcf.endswith(".annotated"):
            with open(vcf, "r") as f:
                samples = {}
                line_count = 1

                for line in f:
                    try:
                        if line.startswith("##"):
                            line_count += 1
                            continue

                        linesplit = line.rstrip().split("\t")
                        if line.startswith("#"):
                            for i in range(9, len(linesplit)):
                                samples[i] = linesplit[i]
                        else:
                            try:
                                infos = linesplit[7].split(",")
                                for i in infos:
                                    mut = i.split("ANN=")[-1]
                                    infosplit = mut.split("|")
                                    for key in samples.keys():
                                        if linesplit[key].split(":")[0].find(
                                                "1") >= 0:
                                            out_string.append("\t".join([
                                                samples[key], infosplit[1],
                                                infosplit[3], infosplit[6],
                                                infosplit[9], infosplit[10],
                                                linesplit[2],
                                                linesplit[key].split(":")[0]
                                            ]))
                            except (IndexError, KeyError):
                                # malformed ANN entry; skip it
                                continue

                    except Exception as e:
                        util.print_status(util.TASK_ERROR)
                        msg = util.REPORT + str(e) + "\n\tline: " + str(
                            line_count) + " | \"" + line.rstrip() + "\"\n"
                        raise VCFWrongFormat(msg)

                    line_count += 1

    sorted_out_string = sorted(out_string)
    df = pd.DataFrame([sub.split("\t") for sub in sorted_out_string],
                      columns=[
                          "Sample", "Annotation", "Gene", "Transcript",
                          "HGVS.c", "HGVS.p", "Variant", "Genotype"
                      ]).drop_duplicates()
    df = df[df['Transcript'].str.contains('NM')]
    df['Transcript'] = df['Transcript'].str.split('.').str[0]
    df.to_csv(out_file, sep='\t', index=False)

    util.print_status(util.TASK_SUCCESS)

    return out_file
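
# The ANN indices used above (infosplit[1], [3], [6], [9], [10]) appear to
# follow the SnpEff ANN sub-field layout: annotation, gene, transcript,
# HGVS.c, HGVS.p. A toy parse of one invented entry:
ann = "A|missense_variant|MODERATE|TP53|gene_id|transcript|NM_000546|protein_coding|1/11|c.215C>G|p.Pro72Arg"
fields = ann.split("|")
print(fields[1], fields[3], fields[6], fields[9], fields[10])
# missense_variant TP53 NM_000546 c.215C>G p.Pro72Arg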
Example #18
#=====[ standard library	]=====
import os

#=====[ our modules	]=====
from BoardImage import BoardImage
from CVAnalyzer import CVAnalyzer
from Board import Board
from util import print_welcome, print_message, print_status

#=====[ globals ]=====
board_image_dir = '../data/marked'


if __name__ == "__main__":
	print_welcome ()

	#==========[ Step 1: get board_image ]==========
	print_status ("Main", "loading board image")
	bi_filename = os.path.join (board_image_dir, 'micah1.bi')
	# bi_filename = os.path.join (board_image_dir, 'above.bi')
	board_image = BoardImage (filename=bi_filename)

	#==========[ Step 2: construct cv_analyzer, get BIH ]==========
	print_status ("Main", "creating cv_analyzer")
	cv_analyzer = CVAnalyzer ()
	print_status ("Main", "finding BIH (board-image homography)")
	BIH	= cv_analyzer.find_board_image_homography (board_image)

	#==========[ Step 3: construct the board	]==========
	print_status ("Main", "constructing the board")
	board = Board (BIH)

	#==========[ Step 4: draw squares on image	]==========