def main():
    """Write per-position class probabilities for each sequence of a fasta file.

    Parameters are obtained from ``get_params()``. The object loaded with
    joblib from ``params.model`` must expose a "scaler" and a "model" entry.
    For every sequence read from ``params.fastafile`` a ``>name`` header is
    written to ``params.outputfile``, followed by one
    ``index residue probability`` line per row of the predicted
    probabilities (first column only). The interpreter then exits with
    status 0.
    """
    params = get_params()

    from pyHCA.core.ioHCA import read_multifasta_it

    # amino acids are numbered from 1 in alphabetical order of their one-letter
    # code; "X" gets index 0
    AA_sorted = {aa: rank for rank, aa in enumerate("ACDEFGHIKLMNPQRSTVWY", start=1)}
    AA_sorted.setdefault("X", 0)

    loaded = joblib.load(params.model)
    rbs_scaler = loaded["scaler"]
    trained_clf = loaded["model"]

    with open(params.outputfile, "w") as outf:
        for prot, record in read_multifasta_it(params.fastafile):
            seq = str(record.seq).upper()
            domains, clusters = prepare_sequence(seq)
            # the feature function is fixed to compute_features3
            raw_features = compute_features3(seq, domains, clusters, AA_sorted)
            scaled_features = rbs_scaler.transform(raw_features)
            probas = trained_clf.predict_proba(scaled_features)

            outf.write(">{}\n".format(prot))
            for pos, row in enumerate(probas):
                outf.write("{} {} {}\n".format(pos + 1, seq[pos], row[0]))

    sys.exit(0)
def domain_sequence(inputf, domainf, outputf, verbose=False):
    """Extract the sequence of each annotated domain into a fasta file.

    Parameters
    ----------
    inputf: string
        path of the fasta file holding the full sequences
    domainf: string
        path of the domain annotation, read with the "seghca" format
    outputf: string
        path of the fasta file to write
    verbose: bool
        accepted for interface compatibility, not used here

    Each domain is written as ``>{protein}-{domain index} {start+1}-{stop}``
    followed by the slice ``seq[start:stop]``. Proteins without annotation
    produce no output.
    """
    annotation = read_annotation(domainf, "seghca")

    with open(outputf, "w") as outf:
        for prot, record in read_multifasta_it(inputf):
            residues = str(record.seq)
            prot_domains = annotation.get(prot, [])
            for rank, domain in enumerate(prot_domains):
                # only the first two fields (start, stop) of a domain are used
                start, stop = domain[:2]
                header = ">{}-{} {}-{}".format(prot, rank, start + 1, stop)
                outf.write("{}\n{}\n".format(header, residues[start:stop]))
def _write_score_block(outf, name, seq, annotations, method):
    """Write one '>name length' header and one score line per position of seq."""
    outf.write(">{} {}\n".format(name, len(seq)))
    if method == "domain":
        # 0/1 mask flagging the positions covered by an annotated domain
        posdomains = np.zeros(len(seq), dtype=np.uint8)
        for domannot in annotations["domain"]:
            posdomains[domannot.start: domannot.stop] = 1
        for pos in range(len(seq)):
            outf.write("{:.5f}\t{}\n".format(annotations["scores"][pos], posdomains[pos]))
    else:
        for pos in range(len(seq)):
            outf.write("{:.5f}\tNaN\n".format(annotations["scores"][pos]))


def _scores(output, dseq, seq_type="aminoacid", t=0.1, method="domain", verbose=False, dist=16):
    """ Write the per-position scores of every sequence to a file.

    Parameters
    ----------
    output: string
        path of the file to write
    dseq : string or dictionary
        source of the sequences. Anything that is not a dictionary is handed
        to ``read_multifasta_it``; a dictionary is iterated as
        (name, sequence) pairs
    seq_type: string
        "aminoacid" scores each sequence directly; any other value scores
        every (strand, frame, start, protein sequence) tuple yielded by
        ``six_frames``
    t : float
        parameter controlling the domain creation based on cluster density
    method : string
        "domain": the second column is 1 inside an annotated domain, else 0
        any other value: the second column is the literal "NaN"
    verbose: bool
        forwarded to ``_annotation_aminoacids``
    dist: int
        forwarded to ``_annotation_aminoacids``

    Return:
    -------
    None, the results are only written to ``output``
    """
    with open(output, "w") as outf:
        # The sequences come from `dseq`; the previous body read them from an
        # undefined name and failed with NameError.
        if isinstance(dseq, dict):
            # assumes the dictionary values are the same kind of object the
            # fasta iterator yields -- TODO confirm against callers
            records = dseq.items()
        else:
            records = read_multifasta_it(dseq)

        if seq_type == "aminoacid":
            for prot, sequence in records:
                annotations = _annotation_aminoacids(sequence, t=t, method=method,
                                                     verbose=verbose, dist=dist)
                _write_score_block(outf, prot, sequence, annotations, method)
        else:
            cnt, nb_dot = 0, 0
            for name, sequence in records:
                for strand, frame, start, protseq in six_frames(sequence):
                    # progress display: one dot every 1000 frames, 80 dots per line
                    cnt += 1
                    if cnt == 1000:
                        cnt = 0
                        sys.stdout.write(".")
                        sys.stdout.flush()
                        nb_dot += 1
                        if nb_dot == 80:
                            nb_dot = 0
                            sys.stdout.write("\n")

                    direction = "5'3'" if strand > 0 else "3'5'"
                    new_name = "{}_{}_Frame_{}_start_{}".format(
                        name, direction, frame + 1, start + 1)

                    annotations = _annotation_aminoacids(protseq, t=t, method=method,
                                                         verbose=verbose, dist=dist)
                    # frames for which nothing is returned are skipped
                    if annotations:
                        _write_score_block(outf, new_name, protseq, annotations, method)
            # terminate the line of progress dots
            sys.stdout.write("\n")
def _annotation(output, inputf, seq_type="aminoacid", t=0.1, method="domain", verbose=False):
    """ Annotate every sequence of a fasta file and write the result to a file.

    NOTE(review): a later definition of `_annotation` in this file rebinds
    the name, so this version looks unreachable -- confirm before relying on it.

    Parameters
    ----------
    output: string
        path of the file to write
    inputf: string
        path of the input file
    seq_type: string
        "aminoacid" annotates each sequence directly; any other value
        annotates every (strand, frame, start, protein sequence) tuple
        yielded by ``six_frames``
    t : float
        parameter controlling the domain creation based on cluster density
    method : string
        forwarded to ``_annotation_aminoacids``
    verbose: bool
        forwarded to ``read_multifasta_it`` and ``_annotation_aminoacids``

    Return:
    -------
    None, each entry is written as a '>name length' header followed by one
    line per domain annotation, then one line per cluster annotation
    """
    with open(output, "w") as outf:
        if seq_type == "aminoacid":
            for prot, sequence in read_multifasta_it(inputf, verbose):
                annotations = _annotation_aminoacids(sequence, t=t, method=method,
                                                     verbose=verbose)
                outf.write(">{} {}\n".format(prot, len(sequence)))
                for domannot in annotations["domain"]:
                    outf.write("{}\n".format(str(domannot)))
                for clustannot in annotations["cluster"]:
                    outf.write("{}\n".format(str(clustannot)))
        else:
            cnt, nb_dot = 0, 0
            # read from `inputf`; the previous body used an undefined name
            # here and failed with NameError
            for name, sequence in read_multifasta_it(inputf, verbose):
                for strand, frame, start, protseq in six_frames(sequence):
                    # progress display: one dot every 1000 frames, 80 dots per line
                    cnt += 1
                    if cnt == 1000:
                        cnt = 0
                        sys.stdout.write(".")
                        sys.stdout.flush()
                        nb_dot += 1
                        if nb_dot == 80:
                            nb_dot = 0
                            sys.stdout.write("\n")

                    direction = "5'3'" if strand > 0 else "3'5'"
                    new_name = "{}_{}_Frame_{}_start_{}".format(
                        name, direction, frame + 1, start + 1)

                    cur_annotation = _annotation_aminoacids(protseq, t=t, method=method,
                                                            verbose=verbose)
                    # both entries are fetched before anything is written
                    domains = list(cur_annotation["domain"])
                    clusters = list(cur_annotation["cluster"])

                    # The header is written for every frame, annotated or not:
                    # the former guard tested a two-key dictionary, which is
                    # always true.
                    outf.write(">{} {}\n".format(new_name, len(protseq)))
                    for domannot in domains:
                        outf.write("{}\n".format(str(domannot)))
                    for clustannot in clusters:
                        outf.write("{}\n".format(str(clustannot)))
            # terminate the line of progress dots
            sys.stdout.write("\n")
def _annotation(output, inputf, seq_type="aminoacid", t=0.1, method="domain", verbose=False):
    """ Annotate every sequence of a fasta file and write a commented report.

    Parameters
    ----------
    output: string
        path of the file to write
    inputf: string
        path of the input file
    seq_type: string
        "aminoacid" annotates each sequence directly; any other value
        annotates every (strand, frame, start, protein sequence) tuple
        yielded by ``six_frames``
    t : float
        parameter controlling the domain creation based on cluster density
    method : string
        forwarded to ``_annotation_aminoacids``
    verbose: bool
        forwarded to ``read_multifasta_it`` and ``_annotation_aminoacids``

    Return:
    -------
    None, the file starts with a '#'-prefixed banner describing the format,
    then each entry is a '>' header line followed by one line per domain
    annotation and one line per cluster annotation
    """
    # Backslashes of the "/!\" markers are escaped so that they are emitted
    # literally: unescaped, "\ " is an invalid escape sequence and a backslash
    # directly before a line break is swallowed as a line continuation.
    banner = (
        "# pyHCA v0.1 segmentation results\n"
        "#\n"
        "# Format:\n"
        "#\n"
        "# >'protein_id' 'protein_length' 'hca_score computed on the whole sequence'\n"
        "# domain 'domain_start' 'domain_stop' 'hca_score' 'hca_pvalue' (if -m domain is used)\n"
        "# cluster 'cluster_start' 'cluster_stop' 'cluster_pattern'\n"
        "#\n"
        "# The hca_score and associated p-value provide a way to measure the foldability\n"
        "# of a protein, i.e how similar is the score compared to scores from disordered\n"
        "# sequences.\n"
        "# Low p-values correspond to scores at the tail of the distribution of scores\n"
        "# for disordered protein sequences.\n"
        "#\n"
        "# /!\\ Warning /!\\\n"
        "# 1- The score computed at the whole protein level (in the line with '>') is for\n"
        "# information only as some people found it useful.\n"
        "# No p-value is associated to this score as the scores used in the distributions\n"
        "# don't come from full protein sequences but domain or \"disordered regions\" of\n"
        "# comparable lengths.\n"
        "#\n"
        "# 2- similarly, scores are displayed even for HCA domain shorted than 30 amino\n"
        "# acids.\n"
        "# As the sequences of length lower than 30 amino acids where filtered out to\n"
        "# compute distributions of scores, no p-values are given.\n"
        "#\n"
        "# In these two cases, the scores provided must be analyzed carefully, keeping\n"
        "# in mind their origin and initial purpose\n"
        "# /!\\ Warning /!\\\n"
        "#\n"
        "#\n"
    )

    with open(output, "w") as outf:
        outf.write(banner)

        if seq_type == "aminoacid":
            for prot, sequence in read_multifasta_it(inputf, verbose):
                annotations = _annotation_aminoacids(sequence, t=t, method=method,
                                                     verbose=verbose)
                score, pvalue = compute_disstat(0, len(sequence), annotations["cluster"])
                # NOTE(review): two values are written here, p-value first,
                # while the banner documents a single score and the
                # nucleotide branch writes only the score -- output kept
                # as is, confirm the intended header format.
                outf.write(">{} {} {:.3f} {:.3f}\n".format(
                    prot, len(sequence), pvalue, score))
                for domannot in annotations["domain"]:
                    outf.write("{}\n".format(str(domannot)))
                for clustannot in annotations["cluster"]:
                    outf.write("{}\n".format(str(clustannot)))
        else:
            cnt, nb_dot = 0, 0
            # read from `inputf`; the previous body used an undefined name
            # here and failed with NameError
            for name, sequence in read_multifasta_it(inputf, verbose):
                for strand, frame, start, protseq in six_frames(sequence):
                    # progress display: one dot every 1000 frames, 80 dots per line
                    cnt += 1
                    if cnt == 1000:
                        cnt = 0
                        sys.stdout.write(".")
                        sys.stdout.flush()
                        nb_dot += 1
                        if nb_dot == 80:
                            nb_dot = 0
                            sys.stdout.write("\n")

                    direction = "5'3'" if strand > 0 else "3'5'"
                    new_name = "{}_{}_Frame_{}_start_{}".format(
                        name, direction, frame + 1, start + 1)

                    cur_annotation = _annotation_aminoacids(protseq, t=t, method=method,
                                                            verbose=verbose)
                    # both entries are fetched before the score is computed
                    domains = list(cur_annotation["domain"])
                    clusters = list(cur_annotation["cluster"])
                    score, pvalue = compute_disstat(0, len(protseq), clusters)

                    # The header is written for every frame, annotated or not:
                    # the former guard tested a two-key dictionary, which is
                    # always true.
                    outf.write(">{} {} {:.3f}\n".format(
                        new_name, len(protseq), score))
                    for domannot in domains:
                        outf.write("{}\n".format(str(domannot)))
                    for clustannot in clusters:
                        outf.write("{}\n".format(str(clustannot)))
            # terminate the line of progress dots
            sys.stdout.write("\n")