def get_sw_scores():
    res = {}
    with open('data/scores.sw') as f:
        for line in f:
            line = line.strip()
            if line.startswith('query:'):
                cur_prot = line[6:]
                res[cur_prot] = {}
            elif line.startswith('score: '):
                it = line[7:].split(' -- ')
                res[cur_prot][it[1]] = int(it[0])
    p = list(res.keys())
    s = list()
    for i in range(len(p)):
        scores = np.zeros((len(p), ), dtype=np.float32)
        for j in range(len(p)):
            norm = max(res[p[i]][p[i]], res[p[j]][p[j]])
            scores[j] = res[p[i]][p[j]] / norm
        s.append(scores)
    df = pd.DataFrame({'proteins': p, 'scores': s})
    prots, sequences = read_fasta(open('data/swissprot.fasta', 'r'))
    prots_dict = {}
    for prot, seq in zip(prots, sequences):
        prots_dict[prot] = seq
    sequences = list()
    for prot in p:
        sequences.append(prots_dict[prot])
    df['sequences'] = sequences
    df.to_pickle('data/sw_scores.pkl')
    return res
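# Illustrative input format for 'data/scores.sw' as implied by the parser in
# get_sw_scores() above (IDs and values made up): each query block starts with
# a 'query:' line and is followed by one 'score: ' line per target, e.g.
#
#   query:P12345
#   score: 523 -- P12345
#   score: 87 -- Q67890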
def pick_long_reads(files, length, outputfile, args):
    totalcount = 0
    passedcount = 0
    for fastafile in files:
        for header, seq in read_fasta(fastafile, False):
            totalcount = totalcount + 1
            if len(seq) >= length:
                longread = True
                if args.cut_stars:
                    # Split on stop codons ('*') and keep the longest segment.
                    seq = seq.split('*')
                    tmp_length = 0
                    for i in range(0, len(seq)):
                        if len(seq[i]) > tmp_length:
                            tmp_length = len(seq[i])
                            j = i
                    if tmp_length < length:
                        print header, tmp_length
                        longread = False
                    seq = seq[j]
                if longread:
                    passedcount = passedcount + 1
                    if header.startswith("Contig"):
                        filename = (fastafile.split("/")[-1]).split(".")[0]
                        header = filename + "_" + header
                    outputfile.write('>%s\n%s\n' % (header, seq))
    outputfile.close()
    print 'searched %s sequences.\n%s of them were longer than %s' \
        % (totalcount, passedcount, length)
def build_index(args):
    i_file = args.input
    utils.logging("[INFO] Start downloading reference file.", args)
    tempbase = utils.gen_file()
    utils.mkdir(tempbase)
    reffile = os.path.join(tempbase, "raw.fa")
    utils.read_hdfs(i_file, reffile)
    tempfiles = [
        open(os.path.join(tempbase, "%s.fa" % m), 'w') for m in conv_way
    ]

    utils.logging("[INFO] Start transforming reference file.", args)
    # read ref
    for chrid, seq in utils.read_fasta(reffile):
        for i, method in enumerate(conv_way):
            (strand, a_from, a_to) = (method[0], method[2], method[4])
            if strand == "W":
                tempfiles[i].write(
                    ">%s\n%s\n" %
                    (chrid,
                     seq.translate(utils.make_trans_with(strand, a_from, a_to))))
            else:
                tempfiles[i].write(
                    ">%s\n%s\n" %
                    (chrid,
                     seq.translate(utils.make_trans_with(strand, a_from,
                                                         a_to))[::-1]))

    # close all files
    for i, method in enumerate(conv_way):
        tempfiles[i].close()

    utils.logging("[INFO] Start launching bowtie2-build.", args)
    # run bowtie jobs
    procs = []
    utils.mkdir(os.path.join(tempbase, "index"))
    for i, method in enumerate(conv_way):
        out_pref = os.path.join(tempbase, "index", method)
        build_log = out_pref + ".build.log"
        proc = Process(target=call_bowtie,
                       args=(tempfiles[i].name, out_pref, build_log))
        procs.append(proc)
        proc.start()
    for proc in procs:
        proc.join()

    utils.logging("[INFO] Start uploading index file.", args)
    # move to hdfs
    utils.copy_to_hdfs(tempbase, args.output, remove_original=True)
def align(sc, args):
    import utils as g_utils
    import align_utils as a_utils

    ## broadcast raw reference
    ref_file = os.path.join(args.tempbase, "ref.fa")
    g_utils.read_hdfs(os.path.join(args.ref, "raw.fa"), ref_file)
    ref_dict = {}
    for chrid, seq in g_utils.read_fasta(ref_file):
        ref_dict[chrid] = (seq, len(seq))
    g_utils.logging("[DEBUG] loading reference done", args)
    bc_refdict = sc.broadcast(ref_dict)

    ## read from hadoop
    readRDD = sc.textFile(args.input) \
        .map(lambda x: g_utils.line2kv(x))
    if args.testmode == "balancing":
        readRDD = readRDD.partitionBy(args.nodes)
    readRDD = readRDD.cache()

    ## transform and get result of bowtie
    c2tTransRDD = readRDD.mapValues(
        lambda x: (x[0].translate(g_utils.make_trans_with("W", "C", "T")), x[1]))
    c2tMapRDD = c2tTransRDD.mapPartitionsWithIndex(
        lambda i, ptn: a_utils.mapping(i, "C2T", ["W_C2T", "C_C2T"], ptn, args))
    g2aTransRDD = readRDD.mapValues(
        lambda x: (x[0].translate(g_utils.make_trans_with("W", "G", "A")), x[1]))
    g2aMapRDD = g2aTransRDD.mapPartitionsWithIndex(
        lambda i, ptn: a_utils.mapping(i, "G2A", ["W_G2A", "C_G2A"], ptn, args))

    mergedRDD = sc.union([readRDD, c2tMapRDD, g2aMapRDD])
    combRDD = mergedRDD.combineByKey(
        lambda v: [v],
        lambda lst, v: lst + [v],
        lambda l1, l2: l1 + l2)
    filteredRDD = combRDD.mapValues(
        lambda x: a_utils.select_and_find_uniq_alignment(x)) \
        .filter(lambda (k, v): v is not None)
    if args.testmode == "balancing":
        filteredRDD = filteredRDD.partitionBy(args.nodes)

    methylRDD = filteredRDD.map(
        lambda x: a_utils.calc_methyl(x, bc_refdict.value, args.num_mm)) \
        .filter(lambda x: x is not None)

    result_path = os.path.join(args.output, "alignment")
    methylRDD.map(lambda x: a_utils.res_to_string(x)).saveAsTextFile(result_path)
    return result_path
def retrieve_orfs(orfs, fastaFile, orfFile):
    orfFile = open(orfFile, 'w')
    for header, seq in read_fasta(fastaFile):
        header = header.split()[0]
        if header in orfs:
            if orfs[header][2] == '+':
                orfFile.write('>%s\n%s\n' %
                              (header, seq[orfs[header][0] - 1:orfs[header][1]]))
            else:
                rev_seq = reverse_complement(
                    seq[orfs[header][0] - 1:orfs[header][1]])
                orfFile.write('>%s\n%s\n' % (header, rev_seq))
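# retrieve_orfs() above relies on a reverse_complement() helper that is not
# shown in this snippet. A minimal sketch of such a helper follows as an
# assumption; the project's own implementation may differ.
_DNA_COMPLEMENT = str.maketrans('ACGTacgtNn', 'TGCAtgcaNn')


def reverse_complement(seq):
    """Return the reverse complement of a DNA sequence."""
    return seq.translate(_DNA_COMPLEMENT)[::-1]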
def tally_gc_counts(fasta_dict=read_fasta()):
    results_dict = dict()
    for k in sorted(fasta_dict.keys()):
        counter = collections.Counter(fasta_dict[k])
        total = sum(counter.values())
        gc_count = counter['G'] + counter['C']
        gc_percent = 100.0 * gc_count / total
        results_dict[k] = gc_percent
    # Return the (id, gc_percent) pair with the highest GC-content.
    # Sorting with key=results_dict.get would pass (key, value) tuples to
    # get() and always yield None, so sort on the GC value explicitly.
    return sorted(results_dict.items(), key=lambda item: item[1])[-1]
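# Hypothetical usage of tally_gc_counts(), assuming read_fasta() returns a
# dict mapping record IDs to sequences (example data made up):
#
#   >>> tally_gc_counts({'Rosalind_0001': 'ATAT', 'Rosalind_0002': 'GGCC'})
#   ('Rosalind_0002', 100.0)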
def run(args):
    fasta = read_fasta(args.fasta, args.logfile)
    gtf = read_gtf(args.gtf, args.logfile)
    out("Random Indexing Test:", args.logfile)
    for i, (ID, sequence) in enumerate(fasta.items()):
        print(ID)
        if "|" in ID:
            ID = ID.split("|")[3]
        gtf_items = gtf[ID]
        if not gtf_items:
            ID = ".".join(ID.split(".")[:-1])
            gtf_items = gtf[ID]
            print(ID)
        gtf_items = gtf[ID]
        exons = [(start, end) for _, start, end, _ in gtf_items]
        print(gtf_items)
        strand = gtf_items[0][3]
        chrom = gtf_items[0][0]
        out("Transcript ID: {}".format(ID), args.logfile)
        out("len: {:4d} sequence[:20]: {}".format(len(sequence), sequence[:20]),
            args.logfile)
        out("gtf_items: {}".format(gtf_items), args.logfile)

        # DEBUG: Checking that the exon lengths sum up to the length of the sequence
        total_length = 0
        for _, start, stop, _ in gtf_items:
            assert stop > start
            total_length += stop - start
        assert total_length == len(sequence), "{} != {}\n{}\n".format(
            total_length, len(sequence), ID, gtf_items)
        assert strand == "+" or strand == "-", "{}".format(strand)
        out("exons: {}".format(exons), args.logfile)

        for _ in range(10):
            queried_index = np.random.randint(0, len(sequence))
            genomic_index = find_location(queried_index, exons, strand, sequence)
            out("Queried Index: {}\tGenomic index: {}".format(
                queried_index, genomic_index), args.logfile)
            out("Transcript window around queried index: {}".format(
                sequence[queried_index - 10:queried_index + 11]), args.logfile)
            out("", args.logfile)
        if i > 20:
            exit()
    out("", args.logfile)
def train(args):
    '''
    This is where the model is created & trained.
    '''
    # Loading the protein fasta sequences
    orig_seqs = []
    with open(args.input_file, 'r') as infasta:
        for _, seq in utils.read_fasta(infasta):
            orig_seqs.append(seq)

    # Prepare training data
    data_x, data_y, vocab_size, vocab_decode = \
        utils.load_data(orig_seqs, args.seq_length)

    # Creating the RNN-LSTM model
    rnn_model = Sequential()
    # Add first layer with input shape provided
    rnn_model.add(
        LSTM(args.hidden_dim,
             input_shape=(None, vocab_size),
             return_sequences=True))
    # Add the remaining layers
    for i in range(args.num_layers - 1):
        rnn_model.add(LSTM(args.hidden_dim, return_sequences=True))
    # We treat each char in the vocabulary as an independent time step
    # and use the TimeDistributed wrapper to apply the same dense layer (with
    # number of units = vocab_size) and the same weights to output one
    # time step from the sequence for each time step in the input.
    # In other words, we process all the time steps of the input
    # sequence at the same time.
    rnn_model.add(TimeDistributed(Dense(vocab_size)))
    rnn_model.add(Activation('softmax'))
    rnn_model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

    # Train the model
    rnn_model.fit(data_x,
                  data_y,
                  batch_size=args.batch_size,
                  verbose=1,
                  epochs=args.train_epochs)

    # Generate new sequences
    proteins = []
    for i in range(args.generate_epochs):
        new_pep = utils.generate_seq(rnn_model, args.seq_length, vocab_size,
                                     vocab_decode)
        proteins.append(new_pep)

    # Write new protein sequences to a fasta file
    utils.write_fasta(proteins, args.output_file)
def load_fasta_sequences(fasta_file1, fasta_file2, fasta1_to_fasta2_blast):
    fasta_sequences = {
        ''.join(os.path.split(filename)[-1].split('.')[:-1]):
        [[headerStr, seq] for headerStr, seq in read_fasta(filename)]
        for filename in [fasta_file1, fasta_file2]
    }
    blast_results = load_blast_results(fasta1_to_fasta2_blast, [0, 1, 2])
    fasta_sources = list(fasta_sequences.keys())
    output_name = '-'.join(
        ['targets', f'{fasta_sources[1].split("_")[-1]}', 'blast'])
    cols = [v[0] for v in fasta_sequences[fasta_sources[0]]]
    rows = [v[0] for v in fasta_sequences[fasta_sources[1]]]
    values = pd.DataFrame(0.0, index=rows, columns=cols)
    for val in blast_results:
        values.at[val[1], val[0]] = val[2]
    values.to_csv(output_name + '.csv')
def main():
    parser = argparse.ArgumentParser(
        description='Adopt pairwise aligner to generate identity matrix')
    parser.add_argument('fasta_path',
                        type=str,
                        help='the path of fasta files to parse')
    parser.add_argument('out_path', type=str, help='output dir')
    args = parser.parse_args([
        '/home/ZwZ/database/M/linsi/M1_M2_unique.fasta',
        '/home/ZwZ/database/M/linsi/M1_M2_unique.npy'
    ])

    fasta = read_fasta(args.fasta_path)
    identity_M = identity(fasta)
    np.save(args.out_path, identity_M)
def bx_multiplicity(rfile):
    """
    Calculate barcode multiplicity given a read file name.

    Returns a dictionary of barcode -> num reads with that barcode.
    """
    bxs = {}
    if rfile == "-":
        rfile = "/dev/stdin"
    with open(rfile) as reads:
        if not reads.isatty():
            for _, _, bx, _ in read_fasta(reads):
                if bx is not None:
                    bxs.setdefault(bx, 0)
                    bxs[bx] += 1
        else:
            raise RuntimeError(
                "Reads must be piped from stdin if file name is not provided")
    return bxs
def insert_peptide_sequences_into_db(self, peptide_fasta):
    """
    Insert peptide sequences into DB using peptide FASTA file.

    NOTE: Risk for high memory consumption as entire FASTA is kept in memory.
    """
    sequences = {
        header.split()[0]: sequence
        for header, sequence in read_fasta(peptide_fasta, keep_formatting=False)
    }
    discriminative_peptides = self.db.execute(
        "SELECT peptide FROM peptides").fetchall()
    for peptide in discriminative_peptides:
        self.db.execute(
            "UPDATE peptides SET sequence = ? WHERE peptide = ?",
            (sequences[peptide[0]], peptide[0]))
        #logging.debug("Added sequence %s for peptide %s", sequences[peptide[0]], peptide[0])  # TODO: verbose
    self.db.commit()
def main():
    parser = argparse.ArgumentParser(
        'filter out identical sequences and write a clean fasta file')
    parser.add_argument(
        'in_path',
        type=str,
        help='please give an absolute path of the input fasta file',
        default='/home/ZwZ/database/M/linsi/M1_M2_all.fasta')
    parser.add_argument('out_path',
                        type=str,
                        help='the absolute path of the filtered fasta file',
                        default='/home/ZwZ/database/M/linsi/filter.fasta')
    args = parser.parse_args()

    ALL = utils.read_fasta(args.in_path, True)
    keep, discard, _ = search_identical(ALL)  # the most important func
    all = [ALL[i] for i in keep]
    SeqIO.write(all, args.out_path, 'fasta')
def load_data():
    ngram_df = pd.read_pickle('data/ngrams.pkl')
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))

    ngrams = list()
    proteins = list()
    f = open('data/swissprot.fasta')
    prots, seqs = read_fasta(f.readlines())
    for protein, seq in zip(prots, seqs):
        if not is_ok(seq) or len(seq) - gram_len + 1 > MAXLEN:
            continue
        proteins.append(protein)
        grams = list()
        for i in range(len(seq) - gram_len + 1):
            grams.append(vocab[seq[i:(i + gram_len)]])
        ngrams.append(grams)

    df = pd.DataFrame({
        'proteins': proteins,
        'ngrams': ngrams,
    })

    def get_values(df):
        grows = []
        gcols = []
        gdata = []
        for i, row in enumerate(df.itertuples()):
            for j in range(len(row.ngrams)):
                grows.append(i)
                gcols.append(j)
                gdata.append(row.ngrams[j])
        data = sparse.csr_matrix((gdata, (grows, gcols)),
                                 shape=(len(df), MAXLEN))
        return data

    return proteins, get_values(df)
def main():
    # parameters used
    min = 200
    max = 370
    mis = 0
    # read primers
    fpri, rpri = utils.read_trad_primer(sys.argv[1])
    lfile = sys.argv[2]
    num = 0
    seq = utils.read_fasta(sys.argv[2])
    for item in seq.items():
        se = item[1]
        for f in fpri:
            for i in range(0, len(se) - len(f)):
                str1 = se[i:len(f) + i]
                #print str1, f
                #print mismatch(str1, f, mis)
                if mismatch(str1, f, mis):
                    #print "match"
                    num += 1
                    output = ">F:" + lfile + "_" + str(num) + "\n" + f + '\n'
                    #print output
                    #print ">F:", f, i
                    for r in rpri:
                        rp = r
                        for j in range(0, len(se) - len(rp)):
                            str2 = se[j:len(rp) + j]
                            if mismatch(str2, rp, mis):
                                #print "match"
                                frp = reverse_complement.get_rc(rp)
                                #print ">R:", frp, j + len(rp), j + len(rp) - i
                                flen = j + len(rp) - i
                                #print flen
                                if flen > min and flen < max:
                                    output = output + ">R:" + lfile + "_" + \
                                        str(num) + "\n" + frp + '\n'
                                    print output
def _read(self, f):
    return utils.read_fasta(f)
parser.add_argument("--step",
                    "-s",
                    default=15,
                    help="the length between each two fragments")
parser.add_argument("--flen",
                    "-fl",
                    default=53,
                    help="the length for each fragment")
parser.add_argument("--prob",
                    "-p",
                    default=0.5,
                    help="the probability for selecting a target site")
parser.add_argument("--numsite",
                    "-ns",
                    default=0,
                    help="the number of target sites")
args = parser.parse_args()

fasta_name = args.input1
onemirna = args.input2
outf = args.output
step = args.step
fraglen = args.flen
prob = args.prob
nums = args.numsite

headers, mrnaseqs = read_fasta(fasta_name)
# prepare the input data
mirnaf = open(onemirna, 'r')
mirnaseq = mirnaf.read().rstrip('\r\n').upper()
print(mirnaseq)
remirna = "".join(reversed(mirnaseq))
mirnaseq = remirna
print(remirna)

maxlen = 79
maxlenmir = 26
mirnalen = len(mirnaseq)
if mirnalen < maxlenmir:
import numpy as np
import random

# read in sites
posfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/rap1-lieb-positives.txt'
negfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/yeast-upstream-1k-negative.fa'
testfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/rap1-lieb-test.txt'

poslist = util.read_pos(posfile)
finaltestlist = util.read_pos(testfile)
posreversecomp = []
for i in poslist:
    posreversecomp.append(util.reverse_complement(i))
poslist = poslist + posreversecomp

neglist = util.read_fasta(negfile)
negreversecomp = []
for i in neglist:
    negreversecomp.append(util.reverse_complement(i))
neglist = neglist + negreversecomp

# Remove any negative sequence that also appears among the positives.
# (Filtering into a new list avoids mutating neglist while iterating over it.)
posset = set(poslist)
neglist = [i for i in neglist if i not in posset]
#print('negs', neglist[:10])
print('neg', len(neglist))
print('pos', len(poslist))

shortneg = []
for i in neglist:
    # TODO adapt so this is a random slice of the negative
def _read(self, f):
    s, t = utils.read_fasta(f, dna_only=True)
    return s, t
def load_sequences(self) -> list:
    self.initial_alignment_df = read_fasta(self.file_name)
    return [sequence for sequence in self.initial_alignment_df.get("sequence")]
parser.add_argument("--numsite",
                    "-ns",
                    default=0,
                    help="the number of called target sites")
args = parser.parse_args()

fasta_mrna = args.input1
fasta_mirna = args.input2
outf = args.output
step = args.step
fraglen = args.flen
prob = args.prob
nums = args.numsite

hmrna, mrnaseqs = read_fasta(fasta_mrna)
hmirna, mirnaseqs = read_fasta(fasta_mirna)
# prepare the input data
remirnas = []
for one in mirnaseqs:
    oneremirna = "".join(reversed(one))
    remirnas.append(oneremirna.upper())
print(mirnaseqs[0])
print(remirnas[0])
mirnaseqs = remirnas

maxlen = 79
maxlenmir = 26
mirnalen = len(min(mirnaseqs, key=len))
import numpy as np

from utils import read_fasta, estimate_population, generate_population

initial_alignment_df = read_fasta(filename='dataset/BB11001-m.fa')
# initial_alignment = [np.fromstring(sequence, dtype=np.uint8) for sequence in initial_alignment_df.get("sequence")]
initial_alignment = [
    sequence for sequence in initial_alignment_df.get("sequence")
]
# initial_alignment = np.array(initial_alignment)

population = generate_population(size=20, alignment=initial_alignment)
print(population)
print(estimate_population(population))
# print(f'Estimated sequence value: {estimated_value}')
def main(args):
    # Mapping COG id to proteins
    cog_prot = defaultdict(lambda: defaultdict(list))
    # Mapping protein to COG ids
    prot_cog = defaultdict(lambda: defaultdict(list))

    if not os.path.exists(args.dest):
        print >> sys.stderr, 'Creating directory %s' % args.dest
        os.makedirs(args.dest)
    if not os.path.isdir(args.dest):
        sys.exit('Destination is not a directory')

    if args.cog_csv:
        cog_csv = (l.strip('\n').split(',') for l in args.cog_csv)
    else:
        if not os.path.exists('%s/cog2003-2014.csv' % args.dest):
            print >> sys.stderr, 'Downloading COG csv from NCBI'
            response = urllib2.urlopen(
                'ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data/cog2003-2014.csv')
            with open('%s/cog2003-2014.csv' % args.dest, 'w') as outh:
                outh.write(response.read())
        cog_csv = (l.strip('\n').split(',')
                   for l in open('%s/cog2003-2014.csv' % args.dest, 'r'))

    for l in cog_csv:
        protid = l[2]
        cogid = l[6]
        spos = int(l[4])
        epos = int(l[5])
        cog_prot[cogid][protid] = merge_interval_list(
            cog_prot[cogid][protid] + [(spos, epos)], dist=1)
        prot_cog[protid][cogid] = merge_interval_list(
            prot_cog[protid][cogid] + [(spos, epos)], dist=1)

    print >> sys.stderr, "Found {:,} COGs".format(len(cog_prot))
    print >> sys.stderr, "Found {:,} protein ids".format(len(prot_cog))

    c = Counter(len(v) for k, v in prot_cog.iteritems())
    print >> sys.stderr, "Proteins belonging to multiple COGs:"
    print >> sys.stderr, 'Num COGs | Count'
    for k in range(10):
        print >> sys.stderr, '%s%d' % (str(k + 1).ljust(12), c[k + 1])
    print >> sys.stderr, '%s%d' % ('11+'.ljust(12),
                                   sum(v for k, v in c.iteritems() if k > 10))

    if args.fasta:
        seqiter = ((seqname, seq) for seqname, seq in read_fasta(
            gzip.GzipFile(fileobj=args.fasta)))
    else:
        if not os.path.exists('%s/prot2003-2014.fa.gz' % args.dest):
            print >> sys.stderr, 'Downloading protein sequences from NCBI'
            response = urllib2.urlopen(
                'ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data/prot2003-2014.fa.gz')
            with open('%s/prot2003-2014.fa.gz' % args.dest, 'wb') as outh:
                outh.write(response.read())
        seqiter = ((seqname, seq) for seqname, seq in read_fasta(
            gzip.open('%s/prot2003-2014.fa.gz' % args.dest, 'rb')))

    # Clear out destination
    purge_msg = True
    for cogid in cog_prot.keys():
        if os.path.exists('%s/%s/%s.faa' % (args.dest, cogid[:5], cogid)):
            if purge_msg:
                print >> sys.stderr, 'Purging existing sequence files'
                purge_msg = False
            os.remove('%s/%s/%s.faa' % (args.dest, cogid[:5], cogid))

    # Create the COG files
    numseqs = 0
    filenames = {}
    seqcounts = Counter()
    for seqname, seq in seqiter:
        try:
            m = re.search(r'gi\|(\d+)\|ref', seqname)
            protid = m.group(1)
        except (AttributeError, ValueError):
            print >> sys.stderr, 'Error parsing sequence name: "%s"' % seqname
            continue
        numseqs += 1
        if not numseqs % 100000:
            print >> sys.stderr, 'Processed %d proteins...' % numseqs
        for cogid, ivs in prot_cog[protid].iteritems():
            for istart, iend in ivs:
                newseqname = 'cog|%s|%s (%d-%d)' % (cogid, seqname, istart, iend)
                newseq = seq[istart - 1:iend]
                seqcounts[cogid] += 1
                if not os.path.exists('%s/%s' % (args.dest, cogid[:5])):
                    os.makedirs('%s/%s' % (args.dest, cogid[:5]))
                with open('%s/%s/%s.faa' % (args.dest, cogid[:5], cogid),
                          'a') as outh:
                    print >> outh, '>%s\n%s' % (newseqname, fmt_seq(newseq))
                filenames[cogid] = '%s/%s.faa' % (cogid[:5], cogid)

    print >> sys.stderr, "Processed %d total proteins" % numseqs

    with open('%s/fungroups.txt' % args.dest, 'w') as outh:
        for k in sorted(filenames.keys()):
            print >> outh, '%s\t%s\t%d' % (k, filenames[k], seqcounts[k])
the label of the next string. In Rosalind's implementation, a string in FASTA
format will be labeled by the ID "Rosalind_xxxx", where "xxxx" denotes a
four-digit code between 0000 and 9999.

Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).

Return: The ID of the string having the highest GC-content, followed by the
GC-content of that string. Rosalind allows for a default error of 0.001 in all
decimal answers unless otherwise stated; please see the note on absolute error
below.
"""
from utils import read_fasta, gc_content

if __name__ == '__main__':
    sequences = read_fasta('data/rosalind_gc.txt')
    max_gc = 0
    max_id = None
    for id, s in sequences.items():
        gc = gc_content(s)
        if gc > max_gc:
            max_gc = gc
            max_id = id
    print max_id
    print max_gc * 100
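# The gc_content() helper imported from utils is not shown here. A minimal
# sketch, assuming it returns a fraction in [0, 1] (the script multiplies the
# result by 100 before printing); the project's own implementation may differ.
def gc_content(seq):
    """Fraction of G and C bases in a DNA string."""
    seq = seq.upper()
    return float(seq.count('G') + seq.count('C')) / len(seq)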
downstream_model_names = [
    '56_Linear.pth', '66_Linear.pth', '68_Linear.pth', '71_Linear.pth',
    '69_Linear.pth'
]
downstream_models = [
    torch.load(os.path.join(model_path, model_name), map_location=device)
    for model_name in downstream_model_names
]

pdb_file = 'C:\\Workspace\\PE\\UI\\saved_models\\1emm_mod.pdb'
atom_lines = utils.process_pdb(pdb_file, atoms_type=['N', 'CA', 'C', 'O'])
coord_array_ca, acid_array, coord_array = utils.extract_coord(
    atom_lines, atoms_type=['N', 'CA', 'C', 'O'])
utils.compare_len(coord_array, acid_array, ['N', 'CA', 'C', 'O'])

seq_dict = utils.read_fasta(seq_file)
with open(os.path.join(files_path, 'results.txt'), 'a') as f:
    f.write('sequence_name\tpredicted_value\n')
    for seq_name, seq in seq_dict.items():
        seq_dict[seq_name] = utils.seq2array(seq)
        array = utils.get_knn_135(coord_array, seq)
        input_ = torch.tensor(array.reshape(-1, 135)).float()
        hidden = pretrained_model(input_).squeeze(1)
        output = 0
        for model in downstream_models:
            model.eval()
            model.is_training = False
            output += model(hidden)
def write_fasta(self):
    self.initial_alignment_df = read_fasta(self.file_name)
    with open("BB11001.aln", "w") as file_handler:
        for index, id in enumerate(self.initial_alignment_df.get("id")):
            file_handler.write(">" + id + "\n")
            file_handler.write(self.best_solution[index] + "\n")
def parse_orfs(orfFinderOut, orfFile, minLength):
    orfOut = open(orfFile, 'w')
    for header, seq in read_fasta(orfFinderOut, False):
        if len(seq) > minLength:
            seq = find_common_startcodon(seq, minLength)
            orfOut.write('>%s\n%s\n' % (header, seq))
parser.add_argument('--depth', default=7, help="min,max")
parser.add_argument('--period', default=1, help="min,max")
parser.add_argument('-k', default=1, help="k-fold validation")
parser.add_argument('model', nargs='+')
args = parser.parse_args()

seqs = []
labels = dict()
all_labels = set()
for f in args.model:
    label = ".".join(os.path.split(f)[-1].split(".")[:-1])
    all_labels.add(label)
    for seq in utils.read_fasta(f):
        seqs.append(seq)
        labels[seq["seqid"]] = label

best_args = [12, 7, 1, 100]
print "width\tdepth\tperiod\terror_rate"
for w in get_range(args.window):
    for d in get_range(args.depth):
        if w < d:
            continue
        for p in get_range(args.period):
            errors = utils.cross_validate(seqs,
                                          labels,
                                          int(args.k),
                                          window=w,
                                          depth=d,
                                          period=p)
            avg_error = sum(errors) / len(errors)
def main(go_file, train_sequences_file, train_annotations_file,
         test_sequences_file, test_annotations_file, out_terms_file,
         train_data_file, test_data_file, min_count):
    logging.info('Loading GO')
    go = Ontology(go_file, with_rels=True)

    logging.info('Loading training annotations')
    train_annots = {}
    with open(train_annotations_file, 'r') as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            if prot_id not in train_annots:
                train_annots[prot_id] = set()
            go_id = it[1]
            train_annots[prot_id].add(go_id)

    logging.info('Loading training sequences')
    info, seqs = read_fasta(train_sequences_file)
    proteins = []
    sequences = []
    annotations = []
    for prot_info, sequence in zip(info, seqs):
        prot_id = prot_info.split()[0]
        if prot_id in train_annots:
            proteins.append(prot_id)
            sequences.append(sequence)
            annotations.append(train_annots[prot_id])

    prop_annotations = []
    cnt = Counter()
    for annots in annotations:
        # Propagate annotations
        annots_set = set()
        for go_id in annots:
            annots_set |= go.get_anchestors(go_id)
        prop_annotations.append(annots_set)
        for go_id in annots_set:
            cnt[go_id] += 1

    df = pd.DataFrame({
        'proteins': proteins,
        'sequences': sequences,
        'annotations': prop_annotations,
    })
    logging.info(f'Train proteins: {len(df)}')
    logging.info(f'Saving training data to {train_data_file}')
    df.to_pickle(train_data_file)

    # Filter terms with annotations more than min_count
    res = {}
    for key, val in cnt.items():
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        terms += val

    logging.info(f'Number of terms {len(terms)}')
    logging.info(f'Saving terms to {out_terms_file}')
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)

    logging.info('Loading testing annotations')
    test_annots = {}
    with open(test_annotations_file, 'r') as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            if prot_id not in test_annots:
                test_annots[prot_id] = set()
            go_id = it[1]
            test_annots[prot_id].add(go_id)

    logging.info('Loading testing sequences')
    info, seqs = read_fasta(test_sequences_file)
    proteins = []
    sequences = []
    annotations = []
    for prot_info, sequence in zip(info, seqs):
        prot_id = prot_info.split()[0]
        if prot_id in test_annots:
            proteins.append(prot_id)
            sequences.append(sequence)
            annotations.append(test_annots[prot_id])

    prop_annotations = []
    for annots in annotations:
        # Propagate annotations
        annots_set = set()
        for go_id in annots:
            annots_set |= go.get_anchestors(go_id)
        prop_annotations.append(annots_set)

    df = pd.DataFrame({
        'proteins': proteins,
        'sequences': sequences,
        'annotations': prop_annotations,
    })
    logging.info(f'Test proteins {len(df)}')
    logging.info(f'Saving testing data to {test_data_file}')
    df.to_pickle(test_data_file)
    assert len(set(len(x) for x in fasta.values())) <= 1
    profile = collections.defaultdict(lambda: [0] * seq_len)
    for i in range(seq_len):
        for base in 'ACGT':
            for dna_seq in fasta.values():
                if dna_seq[i] == base:
                    profile[base][i] += 1
    return dict(profile)


def calculate_consensus():
    seq_len = len(list(fasta.values())[0])
    consensus = []
    profile = build_profile_matrix()
    for j in range(seq_len):
        max_count = 0, ''
        for base in profile:
            if profile[base][j] > max_count[0]:
                max_count = profile[base][j], base
        consensus.append(max_count)
    return ''.join([i[1] for i in consensus])


fasta = read_fasta()
print(calculate_consensus())
for base, profile in build_profile_matrix().items():
    print('%s: %s' % (base, ' '.join(str(i) for i in profile)))
def _read(self, f):
    dnas = [list(dna) for _, dna in utils.read_fasta(f)]
    return np.asarray(dnas)
from sys import argv, exit

from utils import read_fasta

if len(argv) < 3:
    print("usage: script.py FASTA discriminative_peptides.txt")
    exit(1)

fastafile = argv[1]
discpeps = argv[2]

disc = {}
with open(discpeps) as f:
    for line in f:
        if line.startswith("-"):
            break
        peptide, families = line.strip().split(" ", maxsplit=1)
        # Replace the braces and quotes around the family list with spaces.
        translation = str.maketrans("{'}", "   ")
        families = families.translate(translation).split(",")
        disc[peptide] = [fam.strip() for fam in families]

disc_seqs = {}
for header, sequence in read_fasta(fastafile):
    try:
        disc_seqs[sequence] = ", ".join(disc[header])
    except KeyError:
        pass

for seq, target in disc_seqs.items():
    print(seq, target, sep="\t")
def run(args):
    sequences = read_fasta(args.fasta, args.logfile) if args.fasta is not None else None
    ref = None
    if args.gtf is not None:
        ref = read_gtf(args.gtf, args.logfile)
    elif args.bed is not None:
        ref = read_bed(args.bed, args.logfile)
    if ref is None:
        raise NotImplementedError

    meRanCall = read_meRanCall(args.input, args.logfile)
    outfile = open(args.output, "w")
    errfile = open(args.error_out, "w")
    header = "# <chromosome> <genomic position> <strand> <methylation rate> <transcript ID> <transcript position>\n"
    outfile.write(header)

    sequence = None
    not_found_errors = 0
    not_found_error_transcripts = set()
    sequence_length_check_errors = 0
    sequence_length_check_error_transcripts = set()
    for ID, refPos, refStrand, methRate in meRanCall:
        if sequences is not None:
            sequence = sequences[ID]
        out("ID:{} position:{}".format(ID, refPos), args.logfile)
        if "|" in ID:
            ID = ID.split("|")[3]
        if ID not in ref:
            out("Error: Could not find ID {} in reference file".format(ID),
                args.logfile)
            errfile.write(
                "Error: Could not find ID {} in reference file\n".format(ID))
            not_found_errors += 1
            not_found_error_transcripts.add(ID)
            continue
        coordinates = ref[ID]
        assert refStrand == "+"
        if sequence is not None:
            total_length = 0
            for _, start, stop, _ in coordinates:
                total_length += stop - start
            if total_length != len(sequence):
                out(
                    "Error: The lengths of the exons do not match the overall length of the sequence: exons {} != seq {}"
                    .format(total_length, len(sequence)), args.logfile)
                if len(sequence) < total_length:
                    errfile.write(
                        "Error: The lengths of the exons of ID {} do not match the overall length of the sequence: exons {} != seq {}\n"
                        .format(ID, total_length, len(sequence)))
                    sequence_length_check_errors += 1
                    sequence_length_check_error_transcripts.add(ID)
                    continue
                out("Attempting to handle Poly-A...", args.logfile)
                length_diff = len(sequence) - total_length
                last_chunk = sequence[-length_diff:]
                if last_chunk == "A" * length_diff:
                    sequence = sequence[:-length_diff]
                    out("Handled this case by cutting off Poly-A", args.logfile)
                    assert total_length == len(sequence)
                else:
                    out("Could not handle this case by cutting off Poly-A",
                        args.logfile)
                    errfile.write(
                        "Error: The lengths of the exons of ID {} do not match the overall length of the sequence: exons {} != seq {}\n"
                        .format(ID, total_length, len(sequence)))
                    sequence_length_check_errors += 1
                    sequence_length_check_error_transcripts.add(ID)
                    continue
        strand = coordinates[0][3]
        chrom = coordinates[0][0]
        genomic_position = find_location(refPos, coordinates)
        line = "\t".join((chrom, str(genomic_position), strand, str(methRate),
                          ID, str(refPos))) + "\n"
        outfile.write(line)

    out("Finished writing to: {}".format(args.output), args.logfile)
    out(
        "Number of sites whose transcript was not found in the reference file: {}"
        .format(not_found_errors), args.logfile)
    out(
        "Number of sites whose transcript sequence length did not equal the sum of all the exons: {}"
        .format(sequence_length_check_errors), args.logfile)
    out("Total meRanCall lines: {}".format(len(meRanCall)), args.logfile)
    out("", args.logfile)
    out(
        "Number of transcripts that were not found in the reference file: {}"
        .format(len(not_found_error_transcripts)), args.logfile)
    out(
        "Number of transcripts whose length did not equal the sum of all the exons: {}"
        .format(len(sequence_length_check_error_transcripts)), args.logfile)
    out(
        "Total number of transcripts read in the reference file: {}".format(
            len(ref)), args.logfile)

    errfile.write(
        "Number of sites whose transcript was not found in the reference file: {}\n"
        .format(not_found_errors))
    errfile.write(
        "Number of sites whose transcript sequence length did not equal the sum of all the exons: {}\n"
        .format(sequence_length_check_errors))
    errfile.write("Total meRanCall lines: {}\n".format(len(meRanCall)))
    errfile.write("\n")
    errfile.write(
        "Number of transcripts that were not found in the reference file: {}\n"
        .format(len(not_found_error_transcripts)))
    errfile.write(
        "Number of transcripts whose length did not equal the sum of all the exons: {}\n"
        .format(len(sequence_length_check_error_transcripts)))
    errfile.write(
        "Total number of transcripts read in the reference file: {}\n".format(
            len(ref)))
def _read(self, f):
    return utils.read_fasta(f, dna_only=True)[0]
        letter, _ = c.most_common(1)[0]
        consensus += letter
    return consensus, profile


def display(symbols, consensus, profile):
    s = consensus + "\n"
    for symb, counts in zip(symbols, profile):
        s = s + f"{symb}: {list_to_string(counts)}\n"
    return s


if __name__ == "__main__":
    symbols = "ACGT"
    dnas = [
        "ATCCAGCT",
        "GGGCAACT",
        "ATGGATCT",
        "AAGCAACC",
        "TTGGAACT",
        "ATGCCATT",
        "ATGGCACT",
    ]
    display(symbols, *profile(symbols, dnas))

    dnas = read_fasta("inps/rosalind_cons.txt")
    print(dnas)
    s = display(symbols, *profile(symbols, dnas))
    with open("outs/rosalind_cons.txt", "w") as f:
        f.write(s)