def main():
    """
    Print a CSV table of assembly statistics for a directory of assemblies.

    The directory is taken from ``sys.argv[1]``; every file matching
    ``*.fasta`` in it is processed in sorted order. One header row is
    printed, followed by one row per assembly with: assembler name, number
    of contigs, total basepairs, max contig size, N50, and the count,
    basepairs and N50 restricted to the second list returned by
    ``get_contig_lists`` (labelled "contigs>1000bp" in the header). The
    count and basepair columns of that subset also show their share of
    the whole assembly as a percentage.

    If no directory argument is given, the IndexError is printed and the
    process exits.
    """
    try:
        assemblies = sorted(glob.glob(sys.argv[1] + '/*.fasta'))
    except IndexError as e:
        print(e, "Directory not found.")
        sys.exit(0)

    print(','.join([
        'Assembler',
        'Contigs',
        'basepairs',
        'Max contig size',
        'n50',
        'contigs>1000bp (%)',
        ' bp in contigs>1000bp (%)',
        'n50 in contigs>1000bp'
    ]))

    for assembly_file in assemblies:
        filename = utils.get_assember_name(assembly_file)

        contigs, contigs_over_1000bp = get_contig_lists(
            utils.fasta_iter(assembly_file))

        n50_contigs = utils.get_N50(contigs)
        n50_contigs_over_1000bp = utils.get_N50(contigs_over_1000bp)

        n_contigs = len(contigs)
        total_bp = sum(contigs)

        # An assembly without contigs must not abort the whole report:
        # max() of an empty sequence and the divisions below would raise.
        max_contig = max(contigs) if n_contigs > 0 else 0
        pct_contigs = (len(contigs_over_1000bp) / n_contigs) * 100 if n_contigs else 0.0
        pct_bp = (sum(contigs_over_1000bp) / total_bp) * 100 if total_bp else 0.0

        # Values are emitted in the same order as the header above
        # (max contig size comes before N50).
        print(','.join([
            filename,
            f'{n_contigs}',
            f'{total_bp}',
            f'{max_contig}',
            f'{n50_contigs}',
            f'{len(contigs_over_1000bp)} ({pct_contigs:.2f}%)',
            f'{sum(contigs_over_1000bp)} ({pct_bp:.2f}%)',
            f'{n50_contigs_over_1000bp}'
        ]))
def get_protein_counts(dir, hosts, strains, years):
    """Collect the sequence IDs seen for each protein in every data file.

    For each (host, strain, year) combination the file
    ``<dir>/<host>.<strain>.<year>.fa`` is read if it exists. The protein
    label is the text after the last '.' of a record ID.

    Returns a dict keyed by ``'<host>.<strain>.<year>'`` whose values map
    protein -> {sequence ID: True}. Combinations without a file are absent
    from the result.
    """
    counts = {}
    for host in hosts:
        for strain in strains:
            for year in years:
                key = '.'.join((host, strain, str(year)))
                fasta_path = os.path.join(dir, key + '.fa')
                if not os.path.exists(fasta_path):
                    continue

                ids_by_protein = defaultdict(dict)
                counts[key] = ids_by_protein
                for ID, seq in utils.fasta_iter(fasta_path):
                    ids_by_protein[ID.split('.')[-1]][ID] = True
    return counts
def main(sample_id, assembler, assembly, read_mapping_stats, min_len):
    """
    Write global assembly statistics for one sample/assembler pair.

    Two files are created in the current directory:

    * ``<sample_id>_<assembler>_report.json`` - compact JSON with the
      "global" statistics (contigs, basepairs, max contig size, N50,
      mapped reads) and the "filtered" statistics for the contigs kept
      by ``min_len``.
    * ``<sample_id>_<assembler>_global_assembly_stats_global.csv`` - a
      single comma-separated row with the same numbers (without the
      mapped reads).

    :param sample_id: sample identifier; also the top-level key looked up
        in the read mapping stats JSON
    :param assembler: assembler name
    :param assembly: path of the assembly FASTA file
    :param read_mapping_stats: path of a JSON file holding, per sample,
        an "assembler" and a "mapped_reads" entry
    :param min_len: length threshold forwarded to ``get_contig_lists``
    """
    contigs, contigs_over_min_len = get_contig_lists(
        utils.fasta_iter(assembly), min_len)

    n50_contigs = utils.get_Nx(contigs, 0.5)
    n50_contigs_over_min_len = utils.get_Nx(contigs_over_min_len, 0.5)

    # max() raises on an empty sequence, so report 0 for an empty assembly.
    max_contig_size = max(contigs) if len(contigs) > 0 else 0

    # get read mapping stats
    with open(read_mapping_stats) as f:
        assembly_stats_json = json.load(f)

    if assembly_stats_json[sample_id]["assembler"] == assembler:
        mapped_reads = assembly_stats_json[sample_id]["mapped_reads"]
    else:
        # The stats file describes another assembler. Log it and fall back
        # to 0 so the report can still be written; previously the name was
        # left unbound and building the report raised UnboundLocalError.
        logger.error(assembly_stats_json)
        mapped_reads = 0

    json_dic = {
        "assembler": assembler,
        "sample_id": sample_id,
        "global": {
            "contigs": len(contigs),
            "basepairs": sum(contigs),
            "max_contig_size": max_contig_size,
            "N50": n50_contigs,
            "mapped_reads": mapped_reads
        },
        "filtered": {
            "min_len": min_len,
            "contigs": len(contigs_over_min_len),
            "basepairs": sum(contigs_over_min_len),
            "N50": n50_contigs_over_min_len
        }
    }

    with open("{}_{}_report.json".format(sample_id, assembler), "w") as json_report:
        json_report.write(json.dumps(json_dic, separators=(",", ":")))

    with open(
            sample_id + '_' + assembler + "_global_assembly_stats_global.csv",
            "w") as csv_file:
        csv_file.write(','.join([
            assembler,
            f'{len(contigs)}',
            f'{sum(contigs)}',
            f'{max_contig_size}',
            f'{n50_contigs}',
            f'{len(contigs_over_min_len)}',
            f'{sum(contigs_over_min_len)}',
            f'{n50_contigs_over_min_len}'
        ]))
def save_unmapped_contigs(df, assembly_files):
    """
    For each assembly, saves all unmapped contigs in separate fasta files

    One file named ``unmapped_<assembler>.fasta`` is written to the current
    directory per distinct value of the 'Assembler' column. The source
    assembly is the first entry of ``assembly_files`` matching the pattern
    ``*_<assembler>.*``; an IndexError is raised when none matches.

    :param df: dataframe with assembly info; the columns 'Assembler',
        'Contig' and 'Mapped' are read
    :param assembly_files: list of assembly fasta files
    :return: None
    """
    for assembler in sorted(df['Assembler'].unique()):
        fasta = utils.fasta_iter(
            fnmatch.filter(assembly_files, '*_' + assembler + '.*')[0])

        # A set makes the per-record membership test below constant time
        # instead of scanning a list once for every sequence in the file.
        unmapped_contigs = set(
            df['Contig'][(df['Mapped'] == 'Unmapped') &
                         (df['Assembler'] == assembler)])

        with open('unmapped_' + assembler + '.fasta', 'w') as fh:
            for header, seq in fasta:
                if header in unmapped_contigs:
                    fh.write(">" + header + "\n" + seq + "\n")
def get_use_strains(dir, hosts, strains, years):
    """Select the proteins/files that have more than SEQ_LIMIT sequences.

    Every existing ``<dir>/<host>.<strain>.<year>.fa`` file is scanned and
    its record IDs are grouped by protein (the text after the last '.' of
    the ID). When a protein has more than ``global_settings.SEQ_LIMIT``
    distinct IDs in a file, the matching conservation file (the FASTA path
    with '.fa' replaced by '.elms.conservation') is recorded.

    Returns a tuple ``(use_strains, use_files)`` where ``use_strains`` maps
    protein -> {conservation file: True} and ``use_files`` maps
    conservation file -> True.
    """
    use_strains = defaultdict(dict)
    use_files = {}
    for host in hosts:
        for strain in strains:
            for year in years:
                label = '.'.join((host, strain, str(year)))
                fasta_path = os.path.join(dir, label + '.fa')
                if not os.path.exists(fasta_path):
                    continue

                # Group the record IDs of this file by protein.
                ids_by_protein = defaultdict(dict)
                for ID, seq in utils.fasta_iter(fasta_path):
                    ids_by_protein[ID.split('.')[-1]][ID] = True

                cons_file = fasta_path.replace('.fa', '.elms.conservation')
                for protein, ids in ids_by_protein.items():
                    if len(ids) > global_settings.SEQ_LIMIT:
                        use_strains[protein][cons_file] = True
                        use_files[cons_file] = True
    return (use_strains, use_files)
# NOTE(review): the statements down to `match = p.search(tempSeq)` are the tail
# of a definition whose header is outside this view (presumably matchSeq, which
# the script below calls - confirm). Their nesting is reconstructed from the
# statement order; check it against the original file.
offset = 0
while match:
    # Report the current hit once for every ELM registered under this pattern.
    for elm in pattern2elm[elm_pattern]:
        printResult(protein, elm, match, tempSeq, offset)
    # Continue searching one character past the START of the hit (not past its
    # end), so a later hit may overlap this one. `offset` accumulates how many
    # characters have been cut from the front of tempSeq so far.
    tempSeq = tempSeq[int(match.start())+1:]
    offset += int( match.start() ) + 1
    match = p.search(tempSeq)

# Script entry: argv[1] is the pattern file, argv[2] the FASTA file.
req_args = ['pattern file', 'fasta file']
examples = ['../../Data/ELM/elm2pattern', '../../Data/FASTA/Human/hprd.intr.fasta']
utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True)
input_pattern_file = sys.argv[1]
fasta_file = sys.argv[2]
# pattern2regex: pattern string -> compiled regex
# pattern2elm:   pattern string -> {elm name: True}
pattern2regex = {}
pattern2elm = defaultdict(dict)
with open(input_pattern_file) as f:
    for line in f:
        # Each line holds exactly two tab-separated fields: elm name, pattern.
        elm, pattern = line.strip().split('\t')
        pattern2elm[pattern][elm] = True
# Compile each distinct pattern once, before scanning the sequences.
for pattern in pattern2elm:
    pattern2regex[pattern] = re.compile(pattern)
for protein, seq in utils.fasta_iter(fasta_file):
    matchSeq(protein, seq, pattern2elm, pattern2regex)
# remove clusters with more than # one sequence for a host to_remove = {} for cluster in clusters: for host in clusters[cluster]: if len(clusters[cluster][host]) > 1: to_remove[cluster] = True break for rm in to_remove: del clusters[rm] # make new fasta from # resulting clusters seqs = defaultdict(dict) for cluster in clusters: for host in clusters[cluster]: seq_id = clusters[cluster][host].keys()[0] seqs[host][seq_id] = True for host in hosts: fasta_itr = utils.fasta_iter(fasta_dir + name2label[host] + '.fa', getID) with open(outdir + name2label[host] + '.fa', 'w') as outf: for ID, seq in fasta_itr: if ID in seqs[host]: outf.write('>' + ID + '\n') outf.write(seq + '\n') print start_clusters, len(clusters)
# strains = ('H9N2', 'H5N1') input = 'working/input' + str(random.randint(0,100)) rfile = 'working/rfile' + str(random.randint(0,100)) outfile = 'working/Jul22/flu_seqs.png' with open(input, 'w') as f: f.write('Host\tStrain\tYear\tProtein\tLogCount\n') for host in hosts: for year in years: for strain in strains: file = os.path.join(dir, '.'.join((host, strain, str(year))) + '.fa') if os.path.exists(file): protein_counts = defaultdict(dict) for ID, seq in utils.fasta_iter(file): protein_class = ID.split('.')[-1] protein_counts[protein_class][ID] = True for protein in protein_counts: count = len(protein_counts[protein]) if count > global_settings.SEQ_LIMIT: val = math.log(count ,10) f.write('%s\t%s\t%s\t%s\t%.10f\n' % (host, strain, str(year)[2:], global_settings.FLU_PROTEINS[protein], val)) with open(rfile, 'w') as f: f.write('library(ggplot2)\n') f.write("d<-read.delim('" + input + "', header=TRUE, sep='\\t', as.is=TRUE)\n")
for rm in to_remove: del clusters[rm] # make new fasta file by # sampling one species # from each cluster outdir = sys.argv[3] sampled = {} for host in hosts: sampled[host] = {} for cluster in clusters: for host in clusters[cluster]: if len(clusters[cluster][host]) > 1: sample = random.sample(clusters[cluster][host], 1)[0] else: sample = clusters[cluster][host].keys()[0] sampled[host][sample] = True # for host in sampled: # print host, len(sampled[host]) # writed sampled seq_id fasta for host in hosts: fasta_itr = utils.fasta_iter(fasta_dir + name2label[host] + ".fa", getID) with open(outdir + name2label[host] + ".fa", "w") as outf: for ID, seq in fasta_itr: if ID in sampled[host]: outf.write(">" + ID + "\n") outf.write(seq + "\n") print start_clusters, len(clusters)
# NOTE(review): this span is cut at both ends. `f`, `clusters`, `hosts`,
# `name2label`, `seqs_w_fasta`, `fasta_dir` and `getID_local` are defined
# outside this view, and the final loop's body lies beyond it.

# Each line holds exactly three tab-separated fields: cluster, host, seq_id.
# Only hosts present in name2label are kept.
for line in f:
    (cluster, host, seq_id) = line.strip().split('\t')
    if host in name2label:
        # Builds clusters[cluster][host][seq_id] = True, creating levels on demand.
        if cluster not in clusters:
            clusters[cluster] = {}
        if host not in clusters[cluster]:
            clusters[cluster][host] = {}
        clusters[cluster][host][seq_id] = True
        hosts[host] = True
# Number of clusters before any pruning.
start_clusters = len(clusters)

# load fasta file
# and count seq lengths
for host in hosts:
    afile = fasta_dir + name2label[host] + '.fa'
    for ID, seq in utils.fasta_iter(afile, getID_local):
        seqs_w_fasta[host][ID] = len(seq)

# remove seq_ids from roundup that
# are not in fasta
for cluster in clusters:
    for host in clusters[cluster]:
        # Collect first, delete afterwards: the inner dict must not change
        # size while it is being iterated.
        to_remove = {}
        for seq_id in clusters[cluster][host]:
            if seq_id not in seqs_w_fasta[host]:
                to_remove[seq_id] = True
        for rm in to_remove:
            del clusters[cluster][host][rm]

# remove hosts with no seq_ids
for cluster in clusters:
    to_remove = {}
    for host in clusters[cluster]:
""" Create a table of FASTA sequences.
    ID -> sequence

    Usage: <script> <fasta file> <table name>

    The sequences are dumped to a temporary tab-separated file in the
    current directory, a table (seq_id, seq) is created in the database
    opened by utils.init_mysql("fasta"), and the file is bulk-loaded
    with LOAD DATA LOCAL INFILE.
"""
import os
import random
import sys

import utils

fasta_file = sys.argv[1]
table_name = sys.argv[2]

max_len = 0
# NOTE(review): only 101 possible names, so concurrent runs in the same
# directory can collide; kept as-is so the file stays in the working directory.
tmp = "tmp" + str(random.randint(0, 100))
try:
    with open(tmp, "w") as f:
        # The sequence ID is the second '|'-separated field of the header line.
        for protein, seq in utils.fasta_iter(fasta_file,
                                             lambda line: line.split("|")[1]):
            f.write("%s\t%s\n" % (protein, seq))
            max_len = max(len(seq), max_len)

    (conn, cur) = utils.init_mysql("fasta")
    try:
        # NOTE(review): table_name comes straight from the command line and is
        # concatenated into the SQL text; only run with trusted arguments.
        # The seq column is sized to the longest sequence plus a margin of 100.
        line = ("CREATE TABLE " + table_name
                + " ( seq_id CHAR(100), seq TEXT(" + str(max_len + 100) + ") )")
        cur.execute(line)
        line = (
            "LOAD DATA LOCAL INFILE '"
            + tmp
            + "' INTO TABLE "
            + table_name
            + " FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\n'"
        )
        cur.execute(line)
        conn.commit()
    finally:
        # Release the cursor and connection even when a statement fails.
        cur.close()
        conn.close()
finally:
    # Remove the scratch file without spawning a shell, also on failure.
    if os.path.exists(tmp):
        os.remove(tmp)