def create_res_df_from_bam(input_file, reference):
    """Summarise per-contig read statistics of a BAM file as a DataFrame.

    For every record of the reference FASTA, the reads aligned to the contig
    with the same name are fetched from the BAM file and summarised.

    Parameters
    ----------
    input_file : str
        Path to the BAM file (opened in ``'rb'`` mode). ``fetch`` and
        ``count`` are called per contig, which pysam supports on indexed
        files.
    reference : str
        Path to the reference FASTA file.

    Returns
    -------
    pandas.DataFrame
        One row per reference record with the columns ``species``,
        ``chr_length``, ``gc_ref``, ``gc_reads``, ``read_count`` and
        ``basecount``.
    """
    species_list, chr_length_list, read_count_list = [], [], []
    basecount_list, gc_ref_list, gc_reads_list = [], [], []

    # Open the BAM file once and close it deterministically; previously two
    # fresh handles were opened per contig and never closed.
    with pysam.AlignmentFile(input_file, 'rb') as bam:
        for seq_record in SeqIO.parse(reference, 'fasta'):
            contig = seq_record.name

            # Concatenate all reads of the contig. Reads stored without a
            # sequence report ``query_sequence`` as None, so fall back to ''.
            joined_reads = ''.join(
                read.query_sequence or ''
                for read in bam.fetch(contig=contig))

            species_list.append(contig)
            chr_length_list.append(len(seq_record.seq))
            read_count_list.append(bam.count(contig=contig))
            gc_ref_list.append(SeqUtils.GC(seq_record.seq))
            gc_reads_list.append(SeqUtils.GC(joined_reads))
            basecount_list.append(len(joined_reads))

    return pd.DataFrame(
        data={
            'species': species_list,
            'chr_length': chr_length_list,
            'gc_ref': gc_ref_list,
            'gc_reads': gc_reads_list,
            'read_count': read_count_list,
            'basecount': basecount_list,
        })
def add_alignment(aln, result):
    """Add size and base-composition statistics of an alignment to *result*.

    Parameters
    ----------
    aln
        Alignment object providing ``ntax``, ``nchar``,
        ``charpartitions["loci"]`` and a ``matrix`` mapping whose values
        convert to sequence strings with ``str``.
    result : dict
        Mapping that is updated in place.

    Returns
    -------
    dict
        The same *result* object, extended with ``n_taxa``, ``n_sites``,
        ``n_datablocks``, ``gc_proportion`` (the value returned by
        ``SeqUtils.GC``), ``gap_proportion`` and the per-base
        ``a/c/g/t_proportion`` entries.  Proportions whose denominator is
        zero (empty alignment, or no A/C/G/T at all) are reported as 0.0
        instead of raising ZeroDivisionError.
    """
    result["n_taxa"] = aln.ntax
    result["n_sites"] = aln.nchar
    result["n_datablocks"] = len(aln.charpartitions["loci"])

    # All rows of the matrix concatenated into one string.
    aln_seq = ''.join(str(row) for row in aln.matrix.values())

    # Only upper-case A/C/G/T are counted, exactly as before.
    counts = {base: float(aln_seq.count(base)) for base in 'ACGT'}
    sum_count = sum(counts.values())

    # '?', '-' and 'N' are all treated as gaps.
    gaps = float(aln_seq.count('?') + aln_seq.count('-') + aln_seq.count('N'))

    result["gc_proportion"] = SeqUtils.GC(aln_seq)
    result["gap_proportion"] = gaps / len(aln_seq) if aln_seq else 0.0
    for base in 'ACGT':
        key = base.lower() + "_proportion"
        result[key] = counts[base] / sum_count if sum_count else 0.0
    return result
def generate_wide_table(all_fastas):
    """Write one tab-separated row of k-mer counts per FASTA record.

    Reads the module-level ``args`` (``kmer_length``, ``append``) and writes
    to the module-level ``output_handle``.  Each row holds the file path,
    the record description, the sequence length, the ``SeqUtils.GC`` value
    and then the count of every possible A/T/G/C k-mer in sorted order.
    A header row is written first unless ``args.append`` is set.

    Parameters
    ----------
    all_fastas : iterable of str
        Paths of the FASTA files to process.
    """
    k = args.kmer_length
    all_kmers = sorted(
        "".join(letters) for letters in itertools.product('ATGC', repeat=k))

    records_to_kmer = {}
    for f in all_fastas:
        logger.debug("Processing file %s", f)
        for record in SeqIO.parse(f, "fasta", generic_dna):
            logger.debug("Processing sequence %s", record.description)
            seq = str(record.seq)
            # Must be a tuple: it is used as a dictionary key (a list is
            # unhashable and raised TypeError here).
            fasta_keys = (f, record.description, str(len(seq)),
                          str(SeqUtils.GC(record.seq)))
            kmer_counts = collections.defaultdict(int)
            # "+ 1" so that the k-mer ending at the last base is counted too.
            for start in range(len(seq) - k + 1):
                kmer_counts[seq[start:start + k]] += 1
            records_to_kmer[fasta_keys] = kmer_counts

    if not args.append:
        header = ["path", "sequence_description", "sequence_length", "GC"]
        output_handle.write("\t".join(header + all_kmers) + "\n")

    for keys, kmer_values in records_to_kmer.items():
        all_values = list(keys)
        all_values.extend(str(kmer_values.get(x, 0)) for x in all_kmers)
        # write() instead of the Python 2-only "print >>" statement.
        output_handle.write("\t".join(all_values) + "\n")
def extract_gc_content(fasta, window, step):
    """Yield the ``SeqUtils.GC`` value of successive windows of each record.

    Parameters
    ----------
    fasta
        FASTA source handed to ``SeqIO.parse``.
    window : int
        Number of bases per window; windows at the end of a record may be
        shorter because the slice is simply truncated.
    step : int
        Distance between the start positions of consecutive windows.

    Yields
    ------
    The GC value of every window, record after record.
    """
    for record in SeqIO.parse(fasta, "fasta"):
        bases = record.seq
        for start in range(0, len(bases), step):
            stop = start + window
            yield SeqUtils.GC(bases[start:stop])
def get_gen_stats(gbk_list):
    """Compute GC content, coding density and length for GenBank files.

    NOTE: overlapping genes are not taken into account.  Every feature is
    counted separately, so a region covered by several features is counted
    several times and the coding density may be overestimated.

    Parameters
    ----------
    gbk_list : iterable of str
        Paths of the GenBank files.

    Returns
    -------
    dict
        Maps the file name with every ``".gbk"`` removed to a tuple
        ``(length-weighted GC, coding length / total length, total length)``.
    """
    counted_types = ("CDS", "tmRNA", "rRNA", "ncRNA", "tRNA")
    hsh_gen_stats = {}

    for gbk_file in gbk_list:
        genome_len = 0
        weighted_gc = 0
        coding_len = 0

        for record in SeqIO.parse(gbk_file, "genbank"):
            record_len = len(record)
            genome_len += record_len
            weighted_gc += SeqUtils.GC(record.seq) * record_len

            for feature in record.features:
                if (feature.type not in counted_types
                        or "pseudo" in feature.qualifiers):
                    continue
                # Summing over the parts handles compound locations.
                coding_len += sum(
                    part.end - part.start for part in feature.location.parts)

        stats = (float(weighted_gc) / genome_len,
                 float(coding_len) / genome_len,
                 genome_len)
        hsh_gen_stats[gbk_file.replace(".gbk", "")] = stats

    return hsh_gen_stats
def gene_feature(Y):
    """
    Build per-row gene features: sequence length, GC content, DNA melting
    temperature and molecular weight of the gene named in "Target gene".

    Every row of the result carries the values of its own gene; the gene
    sequence is looked up once per distinct gene name.

    Returns a DataFrame indexed like ``Y["Target gene"]``.
    """
    gene_names = Y["Target gene"]
    labels = gene_names.values

    # One row per entry of Y, one column per feature (same order as the
    # column names below).
    table = np.zeros((gene_names.shape[0], 4))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        rows = labels == gene
        table[rows] = [
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, "DNA"),
        ]

    return pd.DataFrame(
        data=table,
        index=gene_names.index,
        columns=[
            "gene length",
            "gene GC content",
            "gene temperature",
            "gene molecular weight",
        ],
    )
def gdps(str_list):
    """Pair the GC value of each string with an x position.

    The positions start at 50 and grow by 100 per string (50, 150, 250, ...).

    Returns a list of ``(position, SeqUtils.GC(string))`` tuples.
    """
    return [(50 + 100 * index, SeqUtils.GC(chunk))
            for index, chunk in enumerate(str_list)]
def generate_long_table(all_fastas):
    """Write one tab-separated row per (FASTA record, k-mer) pair.

    Reads the module-level ``args`` (``kmer_length``, ``append``, ``star``)
    and writes to the module-level ``output_handle``.  Each row holds the
    file path, the record description, the sequence length, the
    ``SeqUtils.GC`` value, the k-mer and its count.  A header row is written
    first unless ``args.append`` is set.

    With ``args.star`` every third position of a k-mer (indices 2, 5, 8, ...)
    is replaced by ``"*"`` before counting.

    Parameters
    ----------
    all_fastas : iterable of str
        Paths of the FASTA files to process.
    """
    k = args.kmer_length

    if not args.append:
        header = ["path", "sequence_description", "sequence_length", "GC",
                  "kmer", "count"]
        # write() instead of the Python 2-only "print >>" statement.
        output_handle.write("\t".join(header) + "\n")

    for f in all_fastas:
        logger.debug("Processing file %s", f)
        for record in SeqIO.parse(f, "fasta", generic_dna):
            logger.debug("Processing sequence %s", record.description)
            seq = str(record.seq)
            fasta_keys = [f, record.description, str(len(seq)),
                          str(SeqUtils.GC(record.seq))]

            kmer_count = collections.defaultdict(int)
            # "+ 1" so that the k-mer ending at the last base is counted too.
            for start in range(len(seq) - k + 1):
                kmer = seq[start:start + k]
                if args.star:
                    masked = list(kmer)
                    # Separate index name: the original reused the outer
                    # loop variable here.
                    for pos in range(2, k, 3):
                        masked[pos] = "*"
                    kmer = "".join(masked)
                kmer_count[kmer] += 1

            for kmer, count in kmer_count.items():
                output_handle.write(
                    "\t".join(fasta_keys + [kmer, str(count)]) + "\n")
def __init__(self, file, fastaRecord):
    """Record basic statistics of one FASTA record.

    Stores the given *file*, plus the record's sequence length, description,
    ``SeqUtils.GC`` value and ``CheckSum.crc32`` checksum.
    """
    super(SequenceStat, self).__init__()
    sequence = fastaRecord.seq
    self.file = file
    self.length = len(sequence)
    self.description = fastaRecord.description
    self.gc = SeqUtils.GC(sequence)
    self.crc32 = CheckSum.crc32(sequence)
def main():
    """Flag contigs whose GC content deviates too far from the mean.

    Parses the command-line arguments, computes the rounded GC value of every
    contig in ``args["fna"]``, and writes the ids of all contigs whose
    absolute deviation from the mean exceeds ``args["cutoff"]`` to
    ``<tmp_dir>/flagged_contigs``, one id per line.
    """
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)

    print("\n## Computing mean contig GC content")
    contigs = {}
    for contig_id, sequence in utility.parse_fasta(args["fna"]):
        entry = Contig()
        entry.id = contig_id
        entry.seq = str(sequence)
        entry.gc = round(SeqUtils.GC(sequence), 2)
        contigs[contig_id] = entry
    mean = np.mean([entry.gc for entry in contigs.values()])

    print("\n## Computing per-contig deviation from mean")
    for entry in contigs.values():
        entry.values = {"delta": abs(entry.gc - mean)}

    print("\n## Identifying outlier contigs")
    flagged = [
        entry.id
        for entry in contigs.values()
        if entry.values["delta"] > args["cutoff"]
    ]

    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f" {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        f.writelines(contig_id + "\n" for contig_id in flagged)
def gene_feature(Y, X, learn_options):
    '''
    Build per-row gene features: sequence length, GC content, DNA melting
    temperature and molecular weight of the gene named in 'Target gene'.

    ``X`` and ``learn_options`` are accepted but not used.  Returns a
    DataFrame indexed like ``Y['Target gene']``.
    '''
    gene_names = Y['Target gene']
    column_shape = (gene_names.shape[0], 1)
    gene_length, gc_content, temperature, molecular_weight = (
        np.zeros(column_shape) for _ in range(4))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # Rows of Y that refer to this gene all receive the same values.
        selected = gene_names.values == gene
        gene_length[selected] = len(seq)
        gc_content[selected] = SeqUtil.GC(seq)
        temperature[selected] = Tm.Tm_NN(seq, rna=False)
        molecular_weight[selected] = SeqUtil.molecular_weight(seq, 'DNA')

    stacked = np.hstack(
        (gene_length, gc_content, temperature, molecular_weight))
    column_names = [
        'gene length', 'gene GC content', 'gene temperature',
        'gene molecular weight'
    ]
    return pandas.DataFrame(data=stacked,
                            index=gene_names.index,
                            columns=column_names)
def _save_distribution(values, color, xlabel, title, path):
    """Draw the distribution of *values* with seaborn and save it to *path*."""
    plt.figure(figsize=(15, 10))
    sns.set(style='white', font_scale=1.3)
    axes = sns.distplot(values, hist=True, rug=False, color=color)
    axes.set(xlabel=xlabel)
    axes.set_title(title)
    sns.despine()
    axes.get_figure().savefig(path)
    plt.clf()


def plot(unmappeddict, unmap_stats, out):
    """
    Generates boxplot, distribution plots and join plots from the missing
    regions summary statistics. They are saved in the output directory as
    jpg images.

    Parameters
    ----------
    unmappeddict: dict
        Dictionary of the coordinates and sequences of the unmapped regions;
        only the values (the sequences) are used here
    unmap_stats: dataframe
        Table containing the unmapped regions summary statistics; columns
        3 to 23 (by position) are drawn in the boxplot
    out: str
        Output directory
    """
    sequences = list(unmappeddict.values())
    gc_content = [SeqUtils.GC(sequence) for sequence in sequences]
    regions_length = [len(sequence) for sequence in sequences]

    # Joint plot of length against GC content.  The data is passed as x=/y=
    # keywords because current seaborn releases no longer accept it
    # positionally.  jointplot builds its own figure, so no plt.figure()
    # call is needed beforehand.
    sns.set(style='white', font_scale=2)
    fig_joint = sns.jointplot(x=regions_length, y=gc_content, kind='hex',
                              height=7)
    fig_joint.set_axis_labels(xlabel='Length', ylabel='GC Content')
    fig_joint.savefig(os.path.join(out, 'gc_length_joint_missing.jpg'))
    plt.clf()

    _save_distribution(gc_content, 'red', 'GC Content',
                       'Distribution of GC Content',
                       os.path.join(out, 'gc_content_missing.jpg'))
    _save_distribution(regions_length, 'green', 'Length',
                       'Distribution of Length',
                       os.path.join(out, 'length_missing.jpg'))

    # Boxplot of the per-codon columns of the statistics table.
    plt.figure(figsize=(15, 10))
    sns.set(style='white', font_scale=1.2)
    ax = sns.boxplot(data=unmap_stats.iloc[:, 3:24], palette='Spectral')
    ax.set_xlabel('Translated Codons')
    ax.set_ylabel('Mean Percentage per Frame (%)')
    sns.despine()
    ax.get_figure().savefig(os.path.join(out, 'codons_missing.jpg'))
    plt.clf()
def get_Seq_ORF_features(file_path, input_file, model):
    """Collect sequence and ORF features for every record of a FASTA file.

    Parameters
    ----------
    file_path : str
        Directory that contains ``feamodule/cpat.py``.
    input_file : str
        FASTA file with the transcripts.
    model
        Key into the module-level ``model_reference``; element ``[1]`` of the
        selected entry is passed to cpat after ``-x``.

    Returns
    -------
    tuple
        ``(features_dict, seq_id, transcript_sequences)`` where
        ``features_dict`` maps the lower-cased record id to its feature
        dictionary, ``seq_id`` lists the lower-cased ids in file order and
        ``transcript_sequences`` lists the record sequences in file order.

    Notes
    -----
    cpat writes to ``temp_cpat.txt`` in the current working directory; the
    resulting ``temp_cpat.txt.dat`` is read and then deleted.
    """
    # Local imports so that the block does not depend on the file header.
    import shlex
    import subprocess

    ctd_keys = (
        'A', 'T', 'G', 'C', 'AT', 'AG', 'AC', 'TG', 'TC', 'GC',
        'A0', 'A1', 'A2', 'A3', 'A4',
        'T0', 'T1', 'T2', 'T3', 'T4',
        'G0', 'G1', 'G2', 'G3', 'G4',
        'C0', 'C1', 'C2', 'C3', 'C4',
    )

    seq_id = []
    features_dict = {}
    transcript_sequences = []

    for record in SeqIO.parse(input_file, "fasta"):
        name = record.id.lower()
        seq = record.seq
        seq_id.append(name)
        transcript_sequences.append(seq)

        features = {
            "length": len(seq),
            "G+C": SeqUtils.GC(seq),
        }

        insta_fe, PI_fe, gra_fe = PP.param(seq)
        _, Cov, inte_fe = leng.len_cov(seq)
        features.update({
            "ORF-integrity": inte_fe,
            "ORF-coverage": Cov,
            "Instability": insta_fe,
            "PI": PI_fe,
            "Gravy": gra_fe,
        })

        ctd_values = tuple(CTD(seq))
        # Keep the strictness of the former 30-name tuple unpacking.
        if len(ctd_values) != len(ctd_keys):
            raise ValueError("CTD returned %d values, expected %d"
                             % (len(ctd_values), len(ctd_keys)))
        features.update(zip(ctd_keys, ctd_values))

        features_dict[name] = features

    # Use cpat to get fickett, hexamer and ORF.  The command is passed as an
    # argument list (no shell), so paths containing spaces or shell
    # metacharacters are handled safely.  The model entry is split the way
    # the shell used to split it, in case it carries additional flags.
    # As before, the exit status is not checked; a failed run surfaces when
    # the result file cannot be opened below.
    command = ["python3", file_path + "/feamodule/cpat.py",
               "-g", input_file, "-o", "temp_cpat.txt", "-x"]
    command.extend(shlex.split(str(model_reference[model][1])))
    subprocess.call(command)

    with open("temp_cpat.txt.dat", "r") as tabular:
        for row in csv.reader(tabular, delimiter="\t"):
            features = features_dict[row[0].lower()]
            features["ORF"] = float(row[2])
            features["fickett"] = float(row[3])
            features["hexamer"] = float(row[4])

    os.remove("temp_cpat.txt.dat")
    return features_dict, seq_id, transcript_sequences
def refstats(reference, mappedlocations, unmappedlocations,
             conflictlocations, reverselocations, unmappeddict):
    """Generates summary statistics for the reference genome based on the
    mapped, unmapped and conflict regions

    Parameters
    ----------
    reference: str
        The file location of the reference FASTA file
    mappedlocations: dataframe
        Coordinates of the mapped regions in the reference sequence
    unmappedlocations: dataframe
        Coordinates of the unmapped regions in the reference sequence
    conflictlocations: dataframe
        Coordinates of the conflict regions in the reference sequence
    reverselocations: dataframe
        Coordinates of the mapped reverse complement regions in the
        reference sequence
    unmappeddict: dict
        Dictionary of the coordinates and sequences of the unmapped regions;
        only its size is used here

    Returns
    -------
    refstats_t: dataframe
        Single-row table containing the reference summary statistics
    """

    def covered(locations):
        # Total absolute distance between the first two columns of each row.
        return sum(
            abs(locations.iloc[row, 1] - locations.iloc[row, 0])
            for row in range(0, locations.shape[0]))

    # Calculate genome fraction
    total_map = (covered(mappedlocations) + covered(conflictlocations)
                 + covered(reverselocations))
    sum_unmap = covered(unmappedlocations)

    read = SeqIO.read(reference, format='fasta')
    ref_length = len(str(read.seq))
    n_mapped_regions = (mappedlocations.shape[0] + reverselocations.shape[0]
                        + conflictlocations.shape[0])

    summary_rows = [{
        'GCContent': SeqUtils.GC(read.seq),
        'Length': ref_length,
        'NumberMappedRegions': n_mapped_regions,
        'NumberUnmappedRegions': unmappedlocations.shape[0],
        'FilteredUnmappedRegions': len(unmappeddict),
        'FractionMapped': (total_map / ref_length) * 100,
        'FractionUnmapped': (sum_unmap / ref_length) * 100,
    }]

    # Create reference summary dataframe
    refstats_t = pd.DataFrame.from_dict(summary_rows)
    return refstats_t.reset_index(drop=True).sort_index()
def get_stats_from_contigs(contigs_fasta):
    """
    Read a contigs fasta with BioPython's SimpleFastaParser and return a
    DataFrame indexed by contig name with the columns 'length' and 'GC'.

    The contig name is the first whitespace-delimited token of the title
    line.
    """
    columns = {'contig': [], 'length': [], 'GC': []}

    # SimpleFastaParser yields plain (title, sequence) string pairs.
    with open(contigs_fasta, 'r') as handle:
        for title, sequence in SeqIO.FastaIO.SimpleFastaParser(handle):
            columns['contig'].append(title.split(None, 1)[0])
            columns['length'].append(len(sequence))
            columns['GC'].append(SeqUtils.GC(sequence))

    return pandas.DataFrame(columns).set_index('contig')
def candidates_for_seq(seq, descriptor, GC_requirement=(0, 100)):
    """Collect the spacer candidates that follow each PAM site in *seq*.

    The sequence is scanned left to right for the module-level ``PAM_SEQ``.
    For every hit, the ``SPACER_LENGTH`` bases right after the PAM are taken
    as the target.  Scanning stops at the first PAM whose target would run
    past the end of the sequence.

    Parameters
    ----------
    seq
        Sequence to scan; must support ``find(sub, start)``, slicing and
        ``len``.
    descriptor : str
        Prefix of the candidate names; the target start position is appended.
    GC_requirement : sequence of two numbers
        Inclusive lower and upper bound for the ``SeqUtils.GC`` value of a
        target.  The default is an immutable tuple rather than a shared
        mutable list.

    Returns
    -------
    list of dict
        One dictionary per accepted target with the keys ``name``,
        ``seqrec`` (a ``SeqRecord``) and ``location`` (target start index).
    """
    candidates = []
    if len(seq) == 0:
        return candidates

    pam_len = len(PAM_SEQ)
    gc_min, gc_max = GC_requirement[0], GC_requirement[1]

    search_from = 0
    while search_from < len(seq):
        # Search with a start offset instead of slicing, which copied the
        # rest of the sequence on every iteration.
        pam_at = seq.find(PAM_SEQ, search_from)
        if pam_at == -1:
            break

        target_start = pam_at + pam_len
        target_end = target_start + SPACER_LENGTH
        if target_end > len(seq):
            break

        targetSeq = seq[target_start:target_end]
        if gc_min <= SeqUtils.GC(targetSeq) <= gc_max:
            name = descriptor + str(target_start)
            target = SeqRecord(targetSeq, id=name, name=name,
                               description=name)
            candidates.append({
                'name': target.id,
                'seqrec': target,
                'location': target_start,
            })

        # Continue one base after this PAM so overlapping sites are found.
        search_from = pam_at + 1

    return candidates
def my_own_filtering(input_file, output_file, filt_gc=45, filt_arom=0.01):
    """Filter FASTA records by GC content and aromaticity.

    Records whose ``SeqUtils.GC`` value is at least *filt_gc* and whose
    translated protein has an aromaticity of at least *filt_arom* are kept.
    Their translations are written to *output_file* in FASTA format, and the
    kept fraction (kept records / all records) is printed.

    Parameters
    ----------
    input_file : str
        Path of the nucleotide FASTA file to read.
    output_file : str
        Path of the protein FASTA file to write.  (Previously this argument
        was ignored and the hard-coded name ``'my_fasta'`` was used.)
    filt_gc : number
        Minimum GC value.
    filt_arom : number
        Minimum aromaticity.
    """
    kept = {}
    total = 0
    with open(input_file, "r") as content:
        for record in SeqIO.parse(content, "fasta"):
            total += 1
            # calculate GC content using Bio
            calc_gc = SeqUtils.GC(record.seq)
            # calculate aromaticity using Bio
            protein = record.seq.translate()
            calc_arom = ProteinAnalysis(str(protein)).aromaticity()
            if calc_gc >= filt_gc and calc_arom >= filt_arom:
                # Keyed by id, so a repeated id keeps its last record.
                # (The original read the non-existent attribute
                # ``record.se`` here.)
                kept[record.id] = protein

    # write a new fasta file with amino acids
    records = [
        SeqRecord(protein, id=seq_id, description="")
        for seq_id, protein in kept.items()
    ]
    with open(output_file, 'w') as write_file:
        SeqIO.write(records, write_file, 'fasta')

    # print the kept fraction; an empty input reports 0.0 instead of
    # raising ZeroDivisionError
    print(len(records) / float(total) if total else 0.0)
def createDataFromRepeatMasker(diretorio):
    """Extract long LINE/L1 sequences listed in RepeatMasker files.

    Every file in *diretorio* whose name matches ``Repeat_Encode*`` is read
    line by line.  For each LINE/L1 line of a known taxon that spans at
    least 5000 positions and whose taxon/encode id is present in
    ``taxon_encodeID_db``, the LINE sequence is cut out, written as FASTA via
    ``escreverArquivoFasta``, its ``calculate.GC`` value is stored in
    ``taxonStat`` and the taxon/encode frequency is updated.

    Parameters
    ----------
    diretorio : str
        Directory containing the RepeatMasker output files.
    """
    path = os.path.abspath(diretorio)
    for arquivo_encode in os.listdir(path):
        # Only RepeatMasker files are processed.
        if not padrao.fnmatch(arquivo_encode, "Repeat_Encode*"):
            continue

        # "with" closes each file even if a line fails to parse.
        with open(os.path.join(path, arquivo_encode), 'r') as arquivo:
            for linha in arquivo:
                # Look for the LINE/L1 pattern in every line.
                if not tem_LINE_L1(linha):
                    continue

                # Format the LINE line and split it into its columns.
                it = formata_RepeatMasker(linha).split('\t')

                # Column 5 holds the taxon (Aotus, Pan, ...).
                taxon = it[4]
                if taxon not in taxons:
                    continue

                encodeID = get_encodeIDFromRepeatMasker(arquivo_encode)
                posicao_inicial = int(it[5])  # start position of the LINE
                posicao_final = int(it[6])  # end position of the LINE

                # Size filter, in base pairs.
                if posicao_final - posicao_inicial < 5000:
                    continue

                # Lookup key for taxon_encodeID_db.
                taxon_encodeID = get_taxon_encodeID(taxon, encodeID)
                if taxon_encodeID not in taxon_encodeID_db:
                    continue

                sequencia = get_Seq_From_TaxonEncodeIDdb(taxon_encodeID)
                sequencia_LINE = get_seq_LINE(sequencia, posicao_inicial,
                                              posicao_final)
                cabecalho_LINE = format_cabecalho_LINE(
                    taxon, encodeID, posicao_inicial, posicao_final)
                alvo_fasta_LINE = format_fasta_LINE(cabecalho_LINE,
                                                    sequencia_LINE)
                escreverArquivoFasta(cabecalho_LINE, alvo_fasta_LINE)
                taxonStat[cabecalho_LINE] = calculate.GC(alvo_fasta_LINE)
                updateFreqTaxonEncode(taxon, encodeID)
def checkgc(self, seq):
    """Check *seq* against every GC rule in ``self.items``.

    A rule applies when its first element is ``gc`` (case-insensitive).  The
    GC value of ``seq.seq`` is handed to ``self._compare`` together with the
    rule's second and third element.  Evaluation stops at the first rule
    whose comparison is falsy.

    Returns the result of the last comparison made, or True when no GC rule
    exists.
    """
    verdict = True
    gc_rules = (rule for rule in self.items
                if re.match(r'^gc$', rule[0], re.I))
    for rule in gc_rules:
        verdict = self._compare(SeqUtils.GC(seq.seq), rule[1], rule[2])
        if not verdict:
            return verdict
    return verdict
def calculate_gc(self):
    """Calculates the GC percent in sequence.

    :return: Float number - GC percent.
    :raises TypeError: if the sequence type is ``'PROTEIN'``.
    """
    if self.s_type != 'PROTEIN':
        return SeqUtils.GC(self.seq)
    raise TypeError('GC are not in {} sequence'.format(self.s_type))
def gc_content(asms_paths):
    """Return the GC fraction of every sequence of every assembly.

    *asms_paths* maps an assembly name to the path of its FASTA file.  The
    result maps each name to a list with ``SeqUtils.GC(seq) / 100`` for
    every record of that file, in file order.
    """
    return {
        asm: [SeqUtils.GC(record.seq) / 100
              for record in SeqIO.parse(path, 'fasta')]
        for asm, path in asms_paths.items()
    }
def genome_seq(genome_path):
    """Read a genome FASTA file and return its sequence as a string.

    Prints the ``SeqUtils.GC`` value and the length of the sequence.

    Parameters
    ----------
    genome_path : str
        Path of the FASTA file.

    Returns
    -------
    str
        The sequence of the last record in the file (every record overwrites
        the previous one, so only the last one of a multi-FASTA is kept).

    Raises
    ------
    ValueError
        If the file contains no FASTA record.  (Previously this surfaced as
        an UnboundLocalError.)
    """
    genome_sequence = None
    # "with" closes the handle even if parsing fails.
    with open(genome_path, 'r') as genome:
        for record in SeqIO.parse(genome, "fasta"):
            genome_sequence = str(record.seq)

    if genome_sequence is None:
        raise ValueError('No FASTA record found in ' + str(genome_path))

    print('Whole genome average GC: ' + str(SeqUtils.GC(genome_sequence)))
    print('Whole genome length: ' + str(len(genome_sequence)))
    return genome_sequence
def gccontent(self):
    """@return float, GC content of sequence in %, rounded to a whole number

    Best effort: if the GC value cannot be computed, 0.0 is returned.
    """
    r = 0.0
    try:
        r = SeqUtils.GC(self.sequence)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        pass
    return round(r, 0)
def calc_thermals(self):
    """Compute thermodynamic and composition values of ``self.sequence``.

    Does nothing when ``self.has_thermals`` is truthy.  Otherwise sets
    ``tm``, ``hairpin_tm``, ``homodimer_tm`` (via a primer3
    ``ThermoAnalysis`` object), ``length``, ``gc_content`` and ``nN`` (the
    number of ``"N"`` characters).

    NOTE(review): ``has_thermals`` is not updated here - presumably it is
    maintained elsewhere; confirm.
    """
    if self.has_thermals:
        return

    thals_obj = ThermoAnalysis()  # Initialize a primer3 thal object
    self.tm = thals_obj.calcTm(self.sequence)
    self.hairpin_tm = thals_obj.calcHairpin(self.sequence).tm
    self.homodimer_tm = thals_obj.calcHomodimer(self.sequence).tm

    self.length = len(self.sequence)
    self.gc_content = SeqUtils.GC(self.sequence)
    self.nN = self.sequence.count("N")
def compute_stats(seq):
    """Return a ``SeqStats`` object describing *seq*.

    The result carries ``length``, ``gc`` (the ``SeqUtils.GC`` value) and
    ``weight`` (``SeqUtils.molecular_weight``, or None when that raises
    ValueError).
    """
    # Create a fresh object per call.  The original bound the name SeqStats
    # itself, so every call overwrote the same shared attributes.
    # NOTE(review): assumes SeqStats can be constructed without arguments.
    stats = SeqStats()
    stats.length = len(seq)
    stats.gc = SeqUtils.GC(seq)
    try:
        stats.weight = SeqUtils.molecular_weight(seq)
    except ValueError:
        stats.weight = None
    return stats
def target_genes_stats(
        genes=('HPRT1', 'TADA1', 'NF2', 'TADA2B', 'NF1', 'CUL3', 'MED12',
               'CCDC101')):
    """Print length, GC content, Tm and molecular weight of each gene.

    Genes for which ``get_gene_sequence`` returns None are skipped.

    Parameters
    ----------
    genes : iterable of str
        Gene names to report.  The default is an immutable tuple rather than
        a shared mutable list.
    """
    for gene in genes:
        seq = get_gene_sequence(gene)
        if seq is None:
            continue
        # Parenthesised single-argument print works as a Python 2 statement
        # and as a Python 3 function call.
        print('%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (
            gene, len(seq), SeqUtil.GC(seq), Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, 'DNA')))
def get_gc():
    """Yield windowed GC values for every contig stored in ``db['genes']``.

    For each document of type ``'contig'`` the sequence is sampled around
    the centres 500, 1500, 2500, ...; every sample covers
    ``seq[centre - 500:centre + 499]``.

    Yields
    ------
    tuple
        ``(safe_species_name(species), contig_id, gc_points)`` where
        ``gc_points`` is a list of ``(centre - 500, centre + 499, GC)``
        tuples.  Contigs that produce no window are skipped.
    """
    for contig in db['genes'].find({'type': 'contig'}):
        seq = contig['dna_seq']
        gc_points = [
            (centre - 500, centre + 499,
             SeqUtils.GC(seq[centre - 500:centre + 499]))
            for centre in range(500, len(seq), 1000)
        ]
        if not gc_points:
            continue
        yield (safe_species_name(contig['species']), contig['contig_id'],
               gc_points)
def gc_contents(asms_paths):
    '''
    Accepts a mapping of assembly name to multifasta path, returns a dict
    mapping each assembly name to the list of its sequences' GC content
    (``SeqUtils.GC(seq) / 100``), in file order.
    '''
    # Named "contents" so that the local does not shadow this function.
    contents = {}
    for asm_name, asm_path in asms_paths.items():
        # The original iterated the ``seqrecords`` function itself and
        # appended under ``record.id``; both are fixed here.
        contents[asm_name] = [
            SeqUtils.GC(record.seq) / 100
            for record in seqrecords(asm_path)
        ]
    return contents
def get_gc_and_len_dict(fastafile):
    """Return ``{record id: {"length": ..., "GC": ...}}`` for a FASTA file.

    "length" is the sequence length and "GC" the ``SeqUtils.GC`` value of
    the record.  If an id occurs more than once, the last record wins.
    """
    return {
        rec.id: {"length": len(rec.seq), "GC": SeqUtils.GC(rec.seq)}
        for rec in SeqIO.parse(fastafile, "fasta")
    }
def get_one_record_features(self, one_record):
    """Convert the features of one record into ``Feature`` objects.

    Features of type ``"misc_feature"`` are skipped.  For every other
    feature the type, contig name, start, stop, length and strand are
    copied.  Optional qualifiers (gene, db_xref, locus_tag, protein_id,
    product, inference, translation) are copied when present.  The feature
    sequence is extracted from ``self.seq`` and its ``SeqUtils.GC`` value
    stored.

    Parameters
    ----------
    one_record
        Record providing ``name`` and ``features``.

    Returns
    -------
    list
        The ``Feature`` objects in record order.
    """
    # (attribute on Feature, qualifier key): the first qualifier value is
    # copied.
    first_value_qualifiers = (
        ('gene', 'gene'),
        ('locus', 'locus_tag'),
        ('protein_id', 'protein_id'),
        ('product', 'product'),
        ('translation', 'translation'),
    )

    feature_list = []
    for source in one_record.features:
        if source.type == "misc_feature":
            continue

        qualifiers = source.qualifiers
        new_feature = Feature()
        new_feature.type = source.type
        new_feature.contig = one_record.name
        new_feature.start = source.location.start
        new_feature.stop = source.location.end
        new_feature.length = len(source.location)
        new_feature.strand = source.strand

        # Optional qualifiers: a missing key or an empty value list is
        # expected and skipped; anything else is no longer silenced.
        for attribute, key in first_value_qualifiers:
            try:
                setattr(new_feature, attribute, qualifiers[key][0])
            except (KeyError, IndexError):
                pass

        try:
            # Unlike the others, the complete value list is stored.
            new_feature.inference = qualifiers['inference']
        except KeyError:
            pass

        try:
            xrefs = qualifiers['db_xref']
            gi_position = find_index("GO*", xrefs)
            # Drop the first three characters of the matched entry.
            new_feature.gi = xrefs[gi_position][3:]
        except Exception:
            # find_index's failure modes are not visible from here, so this
            # lookup stays best-effort (but no longer a bare ``except``).
            pass

        new_feature.seq = source.extract(self.seq)
        new_feature.GC = SeqUtils.GC(new_feature.seq)
        feature_list.append(new_feature)

    return feature_list