def get_sequences():
    """Load the WS235 genome plus reverse complements and lengths.

    The FASTA file is parsed a single time; the reverse-complement and
    length tables are derived from the already-loaded sequences instead of
    re-reading the file.

    Returns:
        tuple: ``(sequences, rc_sequences, chr_lens)`` -- three dicts keyed
        by the first space-delimited token of each FASTA header, holding
        the sequence, ``rc(sequence)`` and ``len(sequence)`` respectively.
    """
    fasta_filename = '/scratch/indexes/WS235.fa'
    # Only the part of the header before the first space is used as the key.
    sequences = dict(
        (p.name.split(' ')[0], p.seq)
        for p in HTSeq.FastaReader(fasta_filename))
    rc_sequences = dict((name, rc(seq)) for name, seq in sequences.items())
    chr_lens = dict((name, len(seq)) for name, seq in sequences.items())
    return (sequences, rc_sequences, chr_lens)
def test_fasta_parser():
    """Iterate the example FASTA file with the default and raw iterators.

    Each pass simply exhausts the reader; reaching the end without an
    exception counts as success and is reported on stdout.
    """
    example_path = 'example_data/fastaExLong.fa'
    passes = (
        ("Test Fasta parser", {}),
        ("Test Fasta parser (raw iterator)", {'raw_iterator': True}),
    )
    for banner, reader_options in passes:
        print(banner)
        for _record in HTSeq.FastaReader(example_path, **reader_options):
            pass
        print("Test passed")
def gatherAllQueries(queryPath):
    """Collect the allele sequences of every FASTA file in a directory.

    Every regular file in ``queryPath`` except ``Allalleles.fasta`` is read
    as one locus. Files are numbered from 1 in ``listdir`` order and the
    alleles inside each file are numbered from 1 in file order.

    Args:
        queryPath: directory containing the per-locus FASTA files.

    Returns:
        tuple: ``(ok, alleleList, bestmatches)``.
        ``ok`` is False when any exception was raised (the two dicts then
        hold whatever had been collected so far).
        ``alleleList`` maps ``"<file#>--<allele#>"`` to the upper-cased
        sequence string.
        ``bestmatches`` maps ``"<file#>"`` to a 10-item list used when
        searching for new alleles: Score, ScoreRatio, Found, queryName,
        HitName, MatchObject, lengthReference, lengthQuery, the number of
        existing alleles for that locus (as a string) and the file path.
    """
    alleleList = {}
    bestmatches = {}
    try:
        queryFilesOnDir = [f for f in listdir(queryPath)
                           if isfile(os.path.join(queryPath, f))]
        countFiles = 0
        for queryFile in queryFilesOnDir:
            if queryFile == 'Allalleles.fasta':
                continue
            AllqueryFile = os.path.join(queryPath, queryFile)
            countFiles += 1
            countAlleles = 0
            for allele in HTSeq.FastaReader(AllqueryFile):
                countAlleles += 1
                alleleKey = str(countFiles) + '--' + str(countAlleles)
                alleleList[alleleKey] = str(allele.seq).upper()
            # One entry per locus file, recorded once all its alleles are counted.
            bestmatches[str(countFiles)] = [
                0, 0, False, '', '', '', 0, '',
                str(countAlleles), AllqueryFile]
    except Exception:
        # Best-effort boundary: the failure is reported through the status flag.
        print('An error occurred')
        return False, alleleList, bestmatches
    return True, alleleList, bestmatches
def create_tables(cims=True):
    """Build filtered, annotated peak tables and FASTA files for CIMS output.

    When ``cims`` is false the work is delegated to ``create_cits_tables``
    and nothing else is done. Otherwise every file matching ``cims_out/*``
    is loaded, filtered, annotated, restricted to protein-coding genes,
    given sequences, written as FASTA under ``fasta/`` and written as a
    table (sorted by descending ``height``) under ``cims_tables/``.

    Args:
        cims: process CIMS output (True) or hand over to the CITS path.
    """
    # Load some library files.
    print("create_tables() called.")
    fasta_filename = '/scratch/indexes/WS235.fa'
    gtf_filename = '../clip/lib/gtf_with_biotype_column.txt'
    sequences = dict(
        (p.name.split(' ')[0], p.seq)
        for p in HTSeq.FastaReader(fasta_filename))
    gtf_df = pandas.read_csv(gtf_filename, sep='\t')
    if not cims:
        create_cits_tables(sequences, gtf_df)
        return
    globstr = 'cims_out/*'
    cits_option = False
    fdr = 0.001
    table_dir = 'cims_tables/'
    for filename in glob.glob(globstr):
        print("create_tables(): %s" % filename)
        print("Loading peaks...")
        peaks = pandas.read_csv(filename, sep='\t')
        rename_columns(peaks)
        peaks = apply_filter(peaks, cits=cits_option, fdr=fdr)
        assign_cims_cits_to_gene.assign_table(
            peaks, gtf_df=gtf_df, given_gtf=True)
        annotate_peaks_with_gene_type.annotate_peaks_with_gene_type(
            peaks, gtf_filename=gtf_filename)
        peaks = peaks[peaks['biotype'] == 'protein_coding']
        get_sequences_for_table(peaks, sequences, expand=10)
        write_fasta(peaks, 'fasta/' + os.path.basename(filename))
        # DataFrame.sort() no longer exists in pandas; sort_values() is the
        # replacement and keeps the same descending, in-place behaviour.
        peaks.sort_values('height', ascending=False, inplace=True)
        write_peaks_table(peaks, filename, tables_folder=table_dir)
    print("Finished processing...")
def read_uniprot_to_dic(in_file, mode="full"):
    """
    Reads a uniprot database and stores the identifier and sequence in a dic

    Parameters:
    ---------------------
    in_file: str,
              file location for the fasta database
    mode: str,
          Either "full" or "seq". "seq" only stores the sequence while
          "full" stores the whole sequence object including name,
          description etc. Any other value prints an error message and
          yields an empty dictionary.

    Returns:
    -------------------------
    db_dic: dict,
            <key:value> with <uniprot_id>: Sequence, where the key is
            ``get_uniprot(record.name)``
    """
    uniprot_dic = {}
    if mode not in ("seq", "full"):
        print("Error! Unsupported mode: %s" % mode)
        return uniprot_dic
    keep_whole_record = mode == "full"
    for seq in HTSeq.FastaReader(in_file):
        uniprot_dic[get_uniprot(seq.name)] = seq if keep_whole_record else seq.seq
    return uniprot_dic
def getFASTAarray(FASTAfile, genomeArray):
    """Store each contig sequence of a FASTA file under its 1-based ordinal.

    Args:
        FASTAfile: path of the FASTA file to read.
        genomeArray: mapping that is filled in place; keys are the contig
            positions as strings ("1", "2", ...), values are ``contig.seq``.

    Returns:
        The same ``genomeArray`` object that was passed in.
    """
    for ordinal, contig in enumerate(HTSeq.FastaReader(FASTAfile), 1):
        genomeArray[str(ordinal)] = contig.seq
    return genomeArray
def add_minus_three_c_column(peaks):
    """Flag which peak sequences contain the tgt/FBE-style motifs.

    If ``peaks`` has no ``seq`` column, sequences are first fetched from
    the WS235 genome via ``get_sequences_for_table``. The ``seq`` column
    is lower-cased in place and six 0/1 integer columns are (re)written:

    * ``has_fbe``       -- ``tgt\\w\\w\\wat``
    * ``minus_one_c``   -- ``c`` directly before that motif
    * ``minus_two_c``   -- ``c`` two positions before it
    * ``minus_three_c`` -- ``c`` three positions before it
    * ``minus_four_c``  -- ``c`` four positions before it
    * ``tgt``           -- a bare ``tgt``

    All six columns are always created (also for an empty table) and are
    filled by row position, so duplicate index labels do not overwrite one
    another.

    Args:
        peaks: pandas DataFrame of peaks; modified in place.

    Returns:
        The same ``peaks`` DataFrame.
    """
    if 'seq' not in peaks.columns:
        fasta_filename = '/scratch/indexes/WS235.fa'
        sequences = dict(
            (p.name.split(' ')[0], p.seq)
            for p in HTSeq.FastaReader(fasta_filename))
        get_sequences_for_table(peaks, sequences, expand=10)
    lowered = [x.lower() for x in peaks['seq'].tolist()]
    peaks['seq'] = lowered
    # Column order matches the order in which the columns used to appear.
    motif_columns = (
        ('minus_three_c', r'c\w\wtgt\w\w\wat'),
        ('minus_four_c', r'c\w\w\wtgt\w\w\wat'),
        ('tgt', r'tgt'),
        ('has_fbe', r'tgt\w\w\wat'),
        ('minus_one_c', r'ctgt\w\w\wat'),
        ('minus_two_c', r'c\wtgt\w\w\wat'),
    )
    for column, pattern in motif_columns:
        motif = re.compile(pattern)
        peaks[column] = [1 if motif.search(seq) else 0 for seq in lowered]
    return peaks
def read_fasta_chrom(fasta_path, chrom):
    """Return the first FASTA record whose name equals ``chrom``.

    Args:
        fasta_path: path of the FASTA file to scan.
        chrom: record name to look for (exact match).

    Returns:
        The matching record object, or the empty string ``''`` when no
        record carries that name.
    """
    matching = (
        record for record in ht.FastaReader(fasta_path)
        if record.name == chrom
    )
    # Scanning stops at the first hit; '' is the "not found" marker.
    return next(matching, '')
def concatAllQueries(queryPath, maxBP, maxalleles):
    """Gather alleles from every FASTA file in a directory into one file.

    ``Allalleles.fasta`` inside ``queryPath`` is emptied first. Then, for
    each other regular file in the directory, the accepted alleles are
    formatted as ``>N\\nSEQUENCE\\n`` (N is a running number across all
    files, the sequence is upper-cased) and handed to
    ``CreateNewAlleleFile`` together with the ``Allalleles.fasta`` path.
    NOTE(review): ``CreateNewAlleleFile`` is presumably appending to that
    file -- confirm against its definition.

    Args:
        queryPath: directory holding the per-locus FASTA files.
        maxBP: alleles longer than this are skipped; ``None`` disables
            the length filter.
        maxalleles: at most this many alleles are taken from each file;
            ``None`` means no limit.

    Returns:
        bool: True on success, False if any exception occurred.
    """
    try:
        allAllelesPath = os.path.join(queryPath, 'Allalleles.fasta')
        # Truncate (or create) the combined file before it is written to.
        with open(allAllelesPath, 'w'):
            pass
        queryFilesOnDir = [f for f in listdir(queryPath)
                           if isfile(os.path.join(queryPath, f))]
        countAlleles = 0
        for queryFile in queryFilesOnDir:
            if queryFile == 'Allalleles.fasta':
                continue
            ToWrite = []
            count = 0
            for allele in HTSeq.FastaReader(os.path.join(queryPath, queryFile)):
                if maxalleles is not None and int(maxalleles) == count:
                    break
                sequence = str(allele.seq)
                if maxBP is not None and len(sequence) > maxBP:
                    continue
                countAlleles += 1
                ToWrite.append(
                    ">" + str(countAlleles) + "\n" + sequence.upper() + "\n")
                count += 1
            CreateNewAlleleFile(allAllelesPath, ToWrite)
    except Exception:
        # Best-effort boundary: the failure is reported through the return value.
        print('An error occurred')
        return False
    return True
def CreateQueryDatabase(FASTAfile, databasePath, queryProteomeName):
    """Translate every allele of a FASTA file and build a BLAST db from it.

    Each record is translated with ``translateSeq``; records that fail to
    translate are reported and skipped. The translated records are written
    in FASTA form to ``queryProteomeName`` and ``Create_Blastdb`` is called
    on that file.

    Args:
        FASTAfile: path of the nucleotide FASTA file with the alleles.
        databasePath: directory in which the database name is built.
        queryProteomeName: path of the protein FASTA file to write; also
            joined onto ``databasePath`` to derive the database name.

    Returns:
        tuple: ``(databasePath, isEmpty)`` -- the derived database path
        (joined path cut at its first '.', plus ``'_db'``) and a flag that
        is True when no allele could be translated.
    """
    translated = []
    for allele in HTSeq.FastaReader(FASTAfile):
        try:
            protein = str(translateSeq(allele.seq))
        except Exception:
            # Untranslatable alleles are left out of the proteome.
            print('Could not translate')
            continue
        translated.append(">" + str(allele.name) + "\n" + protein + "\n")
    isEmpty = not translated
    proteome = "".join(translated)

    databasePath = os.path.join(databasePath, queryProteomeName)
    # NOTE(review): this cuts at the FIRST '.' of the whole path, so a dot
    # in a directory name (e.g. './dbs/x.fasta') truncates there -- confirm
    # callers never pass such paths.
    databasePath = databasePath.split('.')[0]
    databasePath = databasePath + '_db'
    # NOTE(review): "wb" with a str payload only works on Python 2; text
    # mode would be required on Python 3 -- confirm the target interpreter.
    with open(queryProteomeName, "wb") as v:
        v.write(proteome)
    Create_Blastdb(queryProteomeName, 1, True, databasePath)
    return databasePath, isEmpty
def read_fasta_substring(fasta_path, chrom, pos, end):
    """Return the slice ``[pos:end]`` of the record named ``chrom``.

    Args:
        fasta_path: path of the FASTA file to scan.
        chrom: record name to look for (exact match).
        pos: slice start.
        end: slice stop.

    Returns:
        ``record[pos:end]`` for the first record named ``chrom``, or the
        empty string ``''`` when no such record exists.
    """
    for record in ht.FastaReader(fasta_path):
        if record.name != chrom:
            continue
        # First hit wins; the rest of the file is not read.
        return record[pos:end]
    return ''
def read_fasta(fasta_path, dictionary=False, trimN=False):
    """Read all records of a FASTA file into a list or a name-keyed dict.

    Args:
        fasta_path: path of the FASTA file.
        dictionary: when True return ``{record.name: record}`` (a later
            record replaces an earlier one with the same name); otherwise
            return the records as a list in file order.
        trimN: when True every ``'N'`` is removed from each record's
            ``seq`` attribute.

    Returns:
        list or dict of the record objects yielded by ``ht.FastaReader``.
    """
    records = list(ht.FastaReader(fasta_path))
    if trimN:
        # The list branch used to index with an undefined name and raised
        # NameError; trimming the records directly covers both layouts.
        for record in records:
            record.seq = record.seq.replace('N', '')
    if dictionary:
        return dict((record.name, record) for record in records)
    return records
def fasta_to_dataframe(infile, idindex=0):
    """Load the records of a FASTA file into a DataFrame indexed by name.

    Args:
        infile: path of the FASTA file.
        idindex: accepted for interface compatibility; not used here.

    Returns:
        pandas.DataFrame indexed by ``name`` with the columns ``sequence``
        (the decoded ``seq`` of each record) and ``description``.
    """
    rows = []
    for record in HTSeq.FastaReader(infile):
        rows.append((record.name, record.seq.decode(), record.descr))
    column_names = ['name', 'sequence', 'description']
    frame = pd.DataFrame(rows, columns=column_names)
    return frame.set_index(['name'])
def CanProVar_to_table(in_file, out_folder):
    """
    Writes the CanProvar results from the fasta DB to a table and returns
    the indexed pandas dataframe.

    The description of every FASTA record is split on ';' into mutations,
    each of the form ``<dbsnp_id>:<native_id>``. From the native id the
    position (first run of digits) and the FROM/TO parts around it are
    extracted. Records whose description starts with an empty field are
    skipped.

    Two tab-separated files are written by plain string concatenation with
    ``out_folder`` (so it must end with a path separator):
    ``canprovar_ensemble_ids.csv`` (the unique record names) and
    ``canprovar_tab.csv`` (the full table).

    Parameters:
        in_file: path of the CanProVar FASTA file.
        out_folder: output folder prefix.

    Returns:
        pandas.DataFrame indexed by ``ensemble_id`` with the columns
        FROM, TO, POS, ID, native_id and dbsnp_ids.
    """
    # Raw strings: "\d" in a normal literal is an invalid escape sequence.
    position_re = re.compile(r"(\d+)")
    substitution_re = re.compile(r"([A-Z*-]+)\d+([A-Z*-]+)")

    ensemble_id = []
    dbsnp_ids = []
    FROM = []
    TO = []
    POS = []
    ID = []
    native_id = []
    for seq in HTSeq.FastaReader(in_file):
        # multiple mutations are separated by ; in the source file
        mutations = seq.descr.split(";")
        if mutations[0] == '':
            continue
        for mutation in mutations:
            single_mut = mutation.split(":")
            # Parse both parts before appending so the lists stay aligned
            # if a malformed entry raises.
            pos = int(position_re.search(single_mut[1]).groups()[0])
            muts = substitution_re.search(single_mut[1]).groups()
            POS.append(pos)
            FROM.append(muts[0])
            TO.append(muts[1])
            ensemble_id.append(seq.name)
            dbsnp_ids.append(single_mut[0])
            native_id.append(single_mut[1])
            ID.append("CanProVar")

    # convert lists to dataframe (assignment order fixes the column order)
    canprovar_df = pd.DataFrame()
    canprovar_df["FROM"] = FROM
    canprovar_df["TO"] = TO
    canprovar_df["POS"] = POS
    canprovar_df["ID"] = ID
    canprovar_df["native_id"] = native_id
    canprovar_df["ensemble_id"] = ensemble_id
    canprovar_df["dbsnp_ids"] = dbsnp_ids

    # extract only the ensemble_ids
    ensemble_ids = pd.DataFrame()
    ensemble_ids["ids"] = np.unique(canprovar_df["ensemble_id"])
    ensemble_ids.to_csv(out_folder + "canprovar_ensemble_ids.csv", sep="\t")

    # the table is written before indexing; afterwards rows can be
    # addressed via e.g. canprovar_df.loc["ENSP00000370532"]
    canprovar_df.to_csv(out_folder + "canprovar_tab.csv", sep="\t")
    canprovar_df = canprovar_df.set_index("ensemble_id")
    return canprovar_df
def fasta_to_dict(fasta_filename):
    """Map each sequence tag of a collapsed FASTA file to its read count.

    Parses the result of fastq_to_unique_fasta.py. The count is taken from
    the last '_'-separated field of the record name, with any 'x'
    characters removed before conversion to int.

    Args:
        fasta_filename: path of the collapsed FASTA file.

    Returns:
        collections.defaultdict(int) keyed by ``record.seq``.
    """
    reads_per_tag = defaultdict(int)
    for record in HTSeq.FastaReader(fasta_filename):
        count_field = record.name.split('_')[-1]
        reads_per_tag[record.seq] = int(count_field.replace('x', ''))
    return reads_per_tag
def chromosome_names_and_lengths_from_fasta(fasta_fname):
    """Write a ``<name>\\t<length>`` table next to a FASTA file.

    The output file is ``<fasta_fname>.chrom_lengths`` in the same
    directory, one line per sequence name, without a trailing newline.
    Lengths are taken while streaming, so the sequences themselves are not
    all kept in memory at once.

    Args:
        fasta_fname: path of the FASTA file.
    """
    # The raw iterator yields tuples; element 0 is used as the sequence and
    # element 1 as its name.
    lengths = {
        record[1]: len(record[0])
        for record in HTSeq.FastaReader(fasta_fname, raw_iterator=True)
    }
    out_path = PurePath(
        os.path.dirname(fasta_fname),
        os.path.basename(fasta_fname) + '.chrom_lengths')
    with open(out_path, 'w') as f:
        f.write('\n'.join(f'{name}\t{length}'
                          for name, length in lengths.items()))
def filter_fasta(infile):
    """Copy a FASTA file to ``filtered.fa`` without unavailable sequences.

    Records whose sequence equals ``'Sequence unavailable'`` are dropped;
    every other record is rewritten with its name and sequence only.

    Args:
        infile: path of the FASTA file to filter.
    """
    # All records are read before the output is opened, so the input is
    # fully consumed before 'filtered.fa' gets truncated.
    sequences = [(s.name, s.seq) for s in HTSeq.FastaReader(infile)]
    # The context manager guarantees the output is flushed and closed.
    with open('filtered.fa', "w") as out:
        for name, seq in sequences:
            if seq == 'Sequence unavailable':
                continue
            HTSeq.Sequence(seq, name).write_to_fasta_file(out)
    return
def get_sequences(combined):
    """Fill the ``seq`` column of ``combined`` from the WS235 genome.

    For every row the genomic slice ``[left:right]`` of chromosome ``chrm``
    is looked up (FASTA names have ``CHROMOSOME_`` removed) and passed
    through ``rc`` when ``strand`` is ``'-'``. The table is modified in
    place; nothing is returned.

    Args:
        combined: pandas DataFrame with the columns ``left``, ``right``,
            ``chrm`` and ``strand``.
    """
    genome_path = 'lib/c_elegans.WS235.genomic.fa'
    chrom_seqs = {}
    for record in HTSeq.FastaReader(genome_path):
        chrom_seqs[re.sub('CHROMOSOME_', '', record.name)] = record.seq
    for index, _row in combined.iterrows():
        left = combined.loc[index, 'left']
        right = combined.loc[index, 'right']
        chrom = combined.loc[index, 'chrm']
        window = chrom_seqs[chrom][left:right]
        on_minus_strand = combined.loc[index, 'strand'] == '-'
        combined.loc[index, 'seq'] = rc(window) if on_minus_strand else window
def spliting_referance(referance, destination):
    """Write every amplicon record to its own ``<name>.fa`` file.

    NOTE(review): ``referance`` is never read -- the records come from the
    module-level ``AMPLICON_FASTA``. Confirm whether the parameter was
    meant to be used instead.

    Args:
        referance: currently unused.
        destination: directory that receives the per-record FASTA files.

    Returns:
        str: the written file paths joined with commas, in record order.
    """
    written_paths = []
    for record in HTSeq.FastaReader(AMPLICON_FASTA):
        target_path = os.path.join(destination, record.name.strip() + ".fa")
        written_paths.append(target_path)
        with open(target_path, "w") as handle:
            record.write_to_fasta_file(handle)
    return ",".join(written_paths)
def openDNA(filename):
    """Open a FASTA or FASTQ file and estimate how many records it holds.

    The file type is decided from the (case-sensitive) extension.

    Args:
        filename: path of the sequence file.

    Returns:
        tuple: ``(reader, num_lines)`` where ``reader`` is the HTSeq reader
        for the file and ``num_lines`` is the line count for FASTA or a
        quarter of the line count (rounded down) for FASTQ.

    Raises:
        Exception: if the extension is neither a FASTA nor a FASTQ one.
    """
    extension = os.path.splitext(filename)[1]
    if extension in ['.fna', '.fasta', '.ffn', '.faa', '.frn']:
        print('File '+filename+' is type FastA.')
        reader = HTSeq.FastaReader(filename)
        lines_per_record = 1
    elif extension in ['.fq', '.fastq']:
        print('File '+filename+' is type FastQ.')
        reader = HTSeq.FastqReader(filename)
        lines_per_record = 4  # 1/4 of lines are sequences in FastQ
    else:
        raise Exception('Unknown file type, exiting.')
    # Count lines inside a context manager so the handle is closed again.
    with open(filename) as handle:
        line_count = sum(1 for _ in handle)
    return reader, line_count // lines_per_record
def collapse_reads(infile, outfile=None, min_length=15):
    """Collapse identical reads, writing collapsed reads to a new fasta file.

    Retains copy number in fasta headers. Each sequence in the resulting
    file should be unique. Reads are ranked by descending copy number and
    named ``<rank>_<copies>``.

    Args:
        infile: input file; ``.fastq`` is read as solexa-encoded fastq,
            ``.fa``/``.fasta`` as fasta
        outfile: output fasta file with collapsed reads; defaults to
            ``<infile without extension>_collapsed.fa``
        min_length: minimum length of read to include
    Returns:
        True if successful, otherwise False (unsupported extension)
    """
    if outfile is None:
        outfile = os.path.splitext(infile)[0] + '_collapsed.fa'
    print('collapsing reads %s' % infile)
    ext = os.path.splitext(infile)[1]
    if ext == '.fastq':
        fastfile = HTSeq.FastqReader(infile, "solexa")
    elif ext == '.fa' or ext == '.fasta':
        fastfile = HTSeq.FastaReader(infile)
    else:
        print('not fasta or fastq')
        return False

    total = 0
    f = {}
    for s in fastfile:
        seq = s.seq.decode()
        if seq in f:
            f[seq]['reads'] += 1
        else:
            f[seq] = {'name': s.name, 'reads': 1}
        total += 1

    df = pd.DataFrame.from_dict(f, orient='index')
    df.index.name = 'seq'
    df = df.reset_index()
    lengths = df.seq.str.len()
    df = df[lengths >= min_length]
    # The axis must be given by keyword; newer pandas rejects it positionally.
    df = df.drop(['name'], axis=1)
    df = df.sort_values(by='reads', ascending=False).reset_index()
    df['read_id'] = df.index.copy()
    df['read_id'] = df.apply(
        lambda x: str(x.read_id) + '_' + str(x.reads), axis=1)
    utils.dataframe_to_fasta(df, idkey='read_id', outfile=outfile)
    print('collapsed %s reads to %s' % (total, len(df)))
    return True
def readdna(filename):
    """
    Reads in the dna sequence of the given fasta

    If the file holds several records, the last one is returned.

    @type  filename: string
    @param filename: Fasta-file used as input.
    @rtype:   HTSeq Sequence object
    @return:  Reference Fasta.
    @raise ValueError: if the file contains no record.
    """
    referenz = None
    for fasta in HTSeq.FastaReader(filename):
        referenz = HTSeq.Sequence(fasta.seq, fasta.name)
    if referenz is None:
        # An empty file used to surface as an UnboundLocalError.
        raise ValueError("No sequence found in FASTA file: %s" % filename)
    return referenz
def adjust_peak_width(input_folder, table_dir='cims_alt_tables/'):
    """Re-extract sequences for every peaks table in a folder and rewrite it.

    Each file in ``input_folder`` is read as a tab-separated table, given
    sequences via ``get_sequences_for_table`` (``expand=10``,
    ``max_width=False``), written as FASTA under ``fasta/`` and written as
    a table sorted by descending ``height`` into ``table_dir``.

    Args:
        input_folder: directory containing the peaks tables.
        table_dir: output directory for the tables; created if missing.
    """
    fasta_filename = '/scratch/indexes/WS235.fa'
    if not os.path.exists(table_dir):
        # No shell involved, and missing parent directories are created too.
        os.makedirs(table_dir)
    sequences = dict(
        (p.name.split(' ')[0], p.seq)
        for p in HTSeq.FastaReader(fasta_filename))
    for filename in glob.glob(input_folder + '/*'):
        peaks = pandas.read_csv(filename, sep='\t')
        print(peaks.head())
        get_sequences_for_table(peaks, sequences, expand=10, max_width=False)
        write_fasta(peaks, 'fasta/' + os.path.basename(filename))
        # DataFrame.sort() no longer exists in pandas; sort_values() is the
        # replacement and keeps the same descending, in-place behaviour.
        peaks.sort_values('height', ascending=False, inplace=True)
        write_peaks_table(peaks, filename, tables_folder=table_dir)
def build_fa_blob(target, source, env):
    """Pickle a ``{name: sequence}`` dict built from a FASTA file.

    Args:
        target: sequence whose first item, converted with ``str``, is the
            path of the pickle file to write.
        source: sequence whose first item, converted with ``str``, is the
            path of the FASTA file to read.
        env: not used by this function.
    """
    blob_path = str(target[0])
    fasta_path = str(source[0])
    with open(fasta_path, "r") as fasta_handle:
        sequences_by_name = {
            entry.name: entry.seq
            for entry in HTSeq.FastaReader(fasta_handle)
        }
    with open(blob_path, "wb") as blob_handle:
        pickle.dump(sequences_by_name, blob_handle)
def readSequences(self, args):
    """Load every record of the given FASTA files into ``self.fasta``.

    All paths in ``args.fasta`` are checked with ``fileExists`` before any
    file is read. ``self.fasta`` is then reset to an empty dict and filled
    with ``MinimalSeq`` objects keyed by record name; a later record
    replaces an earlier one with the same name.

    Raises:
        PSToolException: if one of the files does not exist.
    """
    fasta_paths = args.fasta
    for path in fasta_paths:
        if fileExists(path):
            continue
        raise PSToolException("Fasta file does not exist: " + str(path))

    self.fasta = {}
    all_records = (
        record
        for path in fasta_paths
        for record in HTSeq.FastaReader(path)
    )
    for record in all_records:
        self.fasta[record.name] = MinimalSeq(
            record.seq, record.name, record.descr)
def returnSequence(fasta):
    """
    Returns the sequence of the first record of a fasta file.

    @type  fasta: string
    @param fasta: path to fasta file.
    @rtype:   string
    @return:  sequence of the first record; None if the file has no record
    """
    first_record = next(iter(HTSeq.FastaReader(fasta)), None)
    if first_record is None:
        return None
    return first_record.seq
def get_sequences(combined):
    """Attach the genomic sequence of every peak to ``combined``.

    Chromosome names are the FASTA record names with ``CHROMOSOME_``
    removed. Each row receives the slice ``[left:right]`` of its ``chrm``
    sequence in the ``seq`` column, passed through ``rc`` for rows on the
    ``'-'`` strand. The DataFrame is updated in place and nothing is
    returned.

    Args:
        combined: pandas DataFrame with ``left``, ``right``, ``chrm`` and
            ``strand`` columns.
    """
    fasta_filename = 'lib/c_elegans.WS235.genomic.fa'

    def _chromosome_key(record):
        # Strip the WormBase prefix so keys match the 'chrm' column values.
        return re.sub('CHROMOSOME_', '', record.name)

    genome = dict(
        (_chromosome_key(record), record.seq)
        for record in HTSeq.FastaReader(fasta_filename))

    for row_label, _unused_row in combined.iterrows():
        begin = combined.loc[row_label, 'left']
        stop = combined.loc[row_label, 'right']
        chromosome = combined.loc[row_label, 'chrm']
        peak_seq = genome[chromosome][begin:stop]
        if combined.loc[row_label, 'strand'] == '-':
            peak_seq = rc(peak_seq)
        combined.loc[row_label, 'seq'] = peak_seq
def generate_seq_stats(seqfile, header, table=None, fastqfile=True):
    '''
    Compute length statistics and a length histogram for a sequence file.

    The returned data is the basis for the sequence statistics table and
    graph (table / hist / edges). If no table object is provided, a new
    one with the row labels under 'Statistic' is created. If a table
    object is provided, a new column is added to it. Every column has one
    value per row label, in the same order, including the median.

    table:      existing table (for adding a column)
    seqfile:    path to sequencefile (fasta/fastq)
    header:     name of column
    fastqfile:  the function assumes a fastq file. "False" will accept fasta

    Returns (table, hist, edges) where hist and edges are plain lists from
    numpy.histogram with one bin per 10 bp of the longest sequence (at
    least one bin).

    Raises ValueError when the file contains no sequences.
    '''
    if not table:
        table = {
            'Statistic': [
                'Count (#)', 'Length (bp)', 'Over 100 bp', 'Over 500 bp',
                'Over 1000 bp', 'Over 5000 bp', 'Over 10000 bp',
                'Largest (bp)', 'Smallest (bp)', 'Average length (bp)',
                'Median (bp)', 'N50'
            ]
        }

    # Parse sequencefile; element 0 of each raw record is the sequence.
    reader_class = HTSeq.FastqReader if fastqfile else HTSeq.FastaReader
    seqlengths = [len(s[0]) for s in reader_class(seqfile, raw_iterator=True)]

    # Calculate statistics, in the order of the 'Statistic' labels.
    column = [len(seqlengths), sum(seqlengths)]
    for threshold in (100, 500, 1000, 5000, 10000):
        column.append(sum(1 for x in seqlengths if x > threshold))
    longest = max(seqlengths)
    column.append(longest)
    column.append(min(seqlengths))
    column.append(np.mean(seqlengths))
    # The median used to be missing, which shifted N50 under 'Median (bp)'.
    column.append(np.median(seqlengths))
    column.append(calculate_n50(seqlengths))
    table[header] = column

    # Create histogram data; numpy rejects zero bins, so use at least one.
    bins = max(1, int(longest / 10))
    hist, edges = np.histogram(seqlengths, density=False, bins=bins)
    return (table, hist.tolist(), edges.tolist())
def readFile(filename, fileType):
    """Read all records of a FASTA/FASTQ file as one upper-cased sequence.

    The sequences of all records are concatenated in file order.

    :param filename: path of the sequence file
    :param fileType: ``FASTA`` or ``FASTQ``; any other value yields an
        empty result
    :rtype : return type is DNA Sequence as a list of characters
    """
    if fileType == FASTA:
        reader = HTSeq.FastaReader(filename)
    elif fileType == FASTQ:
        reader = HTSeq.FastqReader(filename)
    else:
        reader = ()
    chunks = []
    for read in reader:
        seq = read.seq
        if not isinstance(seq, str):
            # Byte sequences are decoded so they can be joined as text.
            seq = seq.decode()
        chunks.append(seq)
    # Join once instead of repeated '+=', and return a real list (a bare
    # map() is a lazy iterator on Python 3).
    return [base.upper() for base in "".join(chunks)]
def extract_exons(fasta_fname, gff_fname):
    """Print, per feature name, the concatenated sequence of its exons.

    GFF features are grouped by name and type. For every name the features
    of type 'Exon' are ordered by interval start, their sequences are
    looked up in the FASTA reader by interval and joined, and the result
    is written to stdout with ``binf.write_fasta_seq``. A name without
    exons is written with an empty sequence.

    Args:
        fasta_fname: path of the FASTA file holding the sequences.
        gff_fname: path of the GFF annotation file.
    """
    genome = HTSeq.FastaReader(fasta_fname)
    # end_included=True as (exon.end - exon.start) % 3 = 2.
    annotation = HTSeq.GFF_Reader(gff_fname, end_included=True)

    grouped = defaultdict(lambda: defaultdict(list))
    for feature in annotation:
        grouped[feature.name][feature.type].append(feature)

    for kog, by_type in grouped.items():
        ordered_exons = sorted(by_type['Exon'], key=lambda exon: exon.iv.start)
        pieces = [str(genome[exon.iv]) for exon in ordered_exons]
        binf.write_fasta_seq(sys.stdout, kog, ''.join(pieces))