Example #1
def main():
    """
    in a directory with assemblies (ended in "*.fasta"),
    calculate the assembly statistics (number of contigs, total number of basepairs, max contig size, n50)
    for all contigs per assembly, including separate stats for the contigs with over 1000bp
    """
    try:
        assemblies = sorted(glob.glob(sys.argv[1] + '/*.fasta'))
    except IndexError:
        print("Missing argument: provide the directory containing the assemblies.")
        sys.exit(1)

    print(','.join([
        'Assembler', 'Contigs', 'basepairs', 'Max contig size', 'n50',
        'contigs>1000bp (%)', 'bp in contigs>1000bp (%)',
        'n50 in contigs>1000bp'
    ]))

    for assembly_file in assemblies:

        filename = utils.get_assember_name(assembly_file)
        contigs, contigs_over_1000bp = get_contig_lists(
            utils.fasta_iter(assembly_file))

        n50_contigs = utils.get_N50(contigs)
        n50_contigs_over_1000bp = utils.get_N50(contigs_over_1000bp)

        print(','.join([
            filename, f'{len(contigs)}', f'{sum(contigs)}',
            f'{max(contigs)}', f'{n50_contigs}',
            f'{len(contigs_over_1000bp)} ({(len(contigs_over_1000bp)/len(contigs))*100:.2f}%)',
            f'{sum(contigs_over_1000bp)} ({(sum(contigs_over_1000bp)/sum(contigs))*100:.2f}%)',
            f'{n50_contigs_over_1000bp}'
        ]))
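
All of these examples lean on utils.fasta_iter, which is never shown on this page. For reference, here is a minimal sketch of such a generator, assuming (as the call sites imply) that it yields (header, sequence) tuples and optionally accepts a function that transforms the raw header:

from itertools import groupby

def fasta_iter(fasta_file, get_id=lambda header: header):
    """Yield (header, sequence) tuples from a FASTA file.
    `get_id` optionally transforms the header text (without the '>'),
    matching the two-argument calls in the later examples.
    """
    with open(fasta_file) as fh:
        # groupby splits the file into alternating runs of
        # header lines ('>'-prefixed) and sequence lines
        for is_header, lines in groupby(fh, lambda l: l.startswith('>')):
            if is_header:
                header = get_id(next(lines).strip()[1:])
            else:
                yield header, ''.join(l.strip() for l in lines)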
Example #2
def get_protein_counts(dir, hosts, strains, years):
    """Count seqs you have per protein"""

    counts = {}
    for host in hosts:
        for strain in strains:
            for year in years:
                key = '.'.join((host, strain, str(year)))
                f = os.path.join(dir, key + '.fa')
                if os.path.exists(f):
                    counts[key] = defaultdict(dict)
                    for ID, seq in utils.fasta_iter(f):
                        protein = ID.split('.')[-1]
                        counts[key][protein][ID] = True
    return counts
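
Despite its name, get_protein_counts returns nested presence maps (key -> protein -> sequence ID -> True) rather than integers; the actual counts are the sizes of the innermost dicts. A hypothetical call (directory and arguments invented for illustration):

counts = get_protein_counts('data/flu', ['human'], ['H5N1'], range(2000, 2005))
for key in counts:
    for protein in counts[key]:
        print(key, protein, len(counts[key][protein]))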
Example #3
def main(sample_id, assembler, assembly, read_mapping_stats, min_len):

    contigs, contigs_over_min_len = get_contig_lists(
        utils.fasta_iter(assembly), min_len)

    n50_contigs = utils.get_Nx(contigs, 0.5)
    n50_contigs_over_min_len = utils.get_Nx(contigs_over_min_len, 0.5)

    # get read mapping stats
    with open(read_mapping_stats) as f:
        assembly_stats_json = json.load(f)
        if assembly_stats_json[sample_id]["assembler"] == assembler:
            mapped_reads = assembly_stats_json[sample_id]["mapped_reads"]
        else:
            logger.error(assembly_stats_json)
            mapped_reads = 0  # fallback so the report below can still be written

    with open("{}_{}_report.json".format(sample_id, assembler),
              "w") as json_report:
        json_dic = {
            "assembler": assembler,
            "sample_id": sample_id,
            "global": {
                "contigs": len(contigs),
                "basepairs": sum(contigs),
                "max_contig_size": max(contigs) if len(contigs) > 0 else 0,
                "N50": n50_contigs,
                "mapped_reads": mapped_reads
            },
            "filtered": {
                "min_len": min_len,
                "contigs": len(contigs_over_min_len),
                "basepairs": sum(contigs_over_min_len),
                "N50": n50_contigs_over_min_len
            }
        }

        json_report.write(json.dumps(json_dic, separators=(",", ":")))

    with open(
            sample_id + '_' + assembler + "_global_assembly_stats_global.csv",
            "w") as csv_file:
        csv_file.write(','.join([
            assembler, f'{len(contigs)}', f'{sum(contigs)}',
            f'{max(contigs) if len(contigs) > 0 else 0}', f'{n50_contigs}',
            f'{len(contigs_over_min_len)}', f'{sum(contigs_over_min_len)}',
            f'{n50_contigs_over_min_len}'
        ]))
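
Examples #1 and #3 both depend on get_contig_lists and an N50/Nx helper defined elsewhere. A plausible sketch of both, assuming get_contig_lists consumes the fasta_iter generator and returns two lists of contig lengths, and that Nx is the usual cumulative-length statistic:

def get_contig_lists(fasta, min_len=1000):
    """Return (all contig lengths, lengths of contigs >= min_len)."""
    contigs, contigs_over_min_len = [], []
    for _header, seq in fasta:
        contigs.append(len(seq))
        if len(seq) >= min_len:
            contigs_over_min_len.append(len(seq))
    return contigs, contigs_over_min_len

def get_Nx(lengths, x):
    """Smallest contig length L such that contigs of length >= L
    hold at least a fraction x of all basepairs (x=0.5 gives N50)."""
    target = sum(lengths) * x
    cumulative = 0
    for length in sorted(lengths, reverse=True):
        cumulative += length
        if cumulative >= target:
            return length
    return 0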
Example #4
def save_unmapped_contigs(df, assembly_files):
    """
    For each assembly, saves all unmapped contigs in separate fasta files
    :param df: dataframe with assembly info
    :param assembly_files: list of assembly fasta files
    :return:
    """
    for assembler in sorted(df['Assembler'].unique()):

        fasta = utils.fasta_iter(
            fnmatch.filter(assembly_files, '*_' + assembler + '.*')[0])
        # use a set for O(1) membership tests in the loop below
        unmapped_contigs = set(df['Contig'][(df['Mapped'] == 'Unmapped')
                                            & (df['Assembler'] == assembler)])
        with open('unmapped_' + assembler + '.fasta', 'w') as fh:
            for header, seq in fasta:
                if header in unmapped_contigs:
                    fh.write(">" + header + "\n" + seq + "\n")
Example #5
def get_use_strains(dir, hosts, strains, years):
    """Find proteins on strains with enough sequences."""

    use_strains = defaultdict(dict)
    use_files = {}
    for host in hosts:
        for strain in strains:
            for year in years:
                key = '.'.join((host, strain, str(year)))
                f = os.path.join(dir, key + '.fa')
                if os.path.exists(f):
                    counts = defaultdict(dict)
                    for ID, seq in utils.fasta_iter(f):
                        protein = ID.split('.')[-1]
                        counts[protein][ID] = True
                    for protein in counts:
                        if len(counts[protein]) > global_settings.SEQ_LIMIT:
                            cons_file = f.replace('.fa',
                                                  '.elms.conservation')
                            use_strains[protein][cons_file] = True
                            use_files[cons_file] = True
    return (use_strains, use_files)
Example #6
            # snippet begins mid-function (inside matchSeq): `p` is the
            # compiled regex for `elm_pattern`; restarting one character
            # past each match start yields overlapping matches
            offset = 0
            while match:
                for elm in pattern2elm[elm_pattern]:
                    printResult(protein, elm,
                                match, tempSeq, offset)
                tempSeq = tempSeq[match.start() + 1:]
                offset += match.start() + 1
                match = p.search(tempSeq)

req_args = ['pattern file',
            'fasta file']
examples = ['../../Data/ELM/elm2pattern',
            '../../Data/FASTA/Human/hprd.intr.fasta']
utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True)

input_pattern_file = sys.argv[1]
fasta_file = sys.argv[2]
pattern2regex = {}
pattern2elm = defaultdict(dict)
with open(input_pattern_file) as f:
    for line in f:
        elm, pattern = line.strip().split('\t')
        pattern2elm[pattern][elm] = True
for pattern in pattern2elm:
    pattern2regex[pattern] = re.compile(pattern)

for protein, seq in utils.fasta_iter(fasta_file):
    matchSeq(protein, seq, pattern2elm, pattern2regex)
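
The while-loop in Example #6 implements overlapping regex matching: re.finditer only yields non-overlapping matches, so after each hit the search restarts one character past the match start. The same idiom in isolation:

import re

p = re.compile('aa')
seq, pos, starts = 'aaaa', 0, []
m = p.search(seq)
while m:
    starts.append(pos + m.start())  # absolute position in the original string
    pos += m.start() + 1
    seq = seq[m.start() + 1:]       # resume just past the last match start
    m = p.search(seq)
print(starts)                       # [0, 1, 2]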


Example #7
# remove clusters with more than
# one sequence for a host
to_remove = {}
for cluster in clusters:
    for host in clusters[cluster]:
        if len(clusters[cluster][host]) > 1:
            to_remove[cluster] = True
            break
for rm in to_remove:
    del clusters[rm]

# make new fasta from
# resulting clusters
seqs = defaultdict(dict)
for cluster in clusters:
    for host in clusters[cluster]:
        seq_id = next(iter(clusters[cluster][host]))
        seqs[host][seq_id] = True

for host in hosts:
    fasta_itr = utils.fasta_iter(fasta_dir 
                                 + name2label[host] + '.fa',
                                 getID)
    with open(outdir + name2label[host] + '.fa', 'w') as outf:
        for ID, seq in fasta_itr:
            if ID in seqs[host]:
                outf.write('>' + ID + '\n')
                outf.write(seq + '\n')
print(start_clusters, len(clusters))
Example #8
#    strains = ('H9N2', 'H5N1')

input_file = 'working/input' + str(random.randint(0, 100))
rfile = 'working/rfile' + str(random.randint(0, 100))
outfile = 'working/Jul22/flu_seqs.png'
with open(input_file, 'w') as f:
    f.write('Host\tStrain\tYear\tProtein\tLogCount\n')
    for host in hosts:
        for year in years:
            for strain in strains:
                file = os.path.join(dir, 
                                    '.'.join((host, 
                                              strain, str(year))) + '.fa')
                if os.path.exists(file):
                    protein_counts = defaultdict(dict)
                    for ID, seq in utils.fasta_iter(file):
                        protein_class = ID.split('.')[-1]
                        protein_counts[protein_class][ID] = True
                    for protein in protein_counts:
                        count = len(protein_counts[protein])
                        if count > global_settings.SEQ_LIMIT:
                            val = math.log(count, 10)
                            f.write('%s\t%s\t%s\t%s\t%.10f\n' %
                                    (host, strain, str(year)[2:], 
                                     global_settings.FLU_PROTEINS[protein], 
                                     val))

with open(rfile, 'w') as f:
    f.write('library(ggplot2)\n')
    f.write("d<-read.delim('"
            + input + "', header=TRUE, sep='\\t', as.is=TRUE)\n")
Example #9
for rm in to_remove:
    del clusters[rm]

# make new fasta file by
# sampling one species
# from each cluster
outdir = sys.argv[3]
sampled = {}
for host in hosts:
    sampled[host] = {}

for cluster in clusters:
    for host in clusters[cluster]:
        if len(clusters[cluster][host]) > 1:
            sample = random.sample(list(clusters[cluster][host]), 1)[0]
        else:
            sample = next(iter(clusters[cluster][host]))
        sampled[host][sample] = True
# for host in sampled:
#    print host, len(sampled[host])

# write sampled seq_ids to fasta
for host in hosts:
    fasta_itr = utils.fasta_iter(fasta_dir + name2label[host] + ".fa", getID)
    with open(outdir + name2label[host] + ".fa", "w") as outf:
        for ID, seq in fasta_itr:
            if ID in sampled[host]:
                outf.write(">" + ID + "\n")
                outf.write(seq + "\n")
print(start_clusters, len(clusters))
Example #10
    # (this snippet resumes inside a `with open(...)` block that reads the
    # tab-separated cluster-membership file)
    for line in f:
        (cluster, host, seq_id) = line.strip().split('\t')
        if host in name2label:
            if cluster not in clusters:
                clusters[cluster] = {}
            if host not in clusters[cluster]:
                clusters[cluster][host] = {}
            clusters[cluster][host][seq_id] = True
            hosts[host] = True
start_clusters = len(clusters)

# load fasta file
# and count seq lengths
for host in hosts:
    afile = fasta_dir + name2label[host] + '.fa'
    for ID, seq in utils.fasta_iter(afile, getID_local):
        seqs_w_fasta[host][ID] = len(seq)
# remove seq_ids from roundup that
# are not in fasta
for cluster in clusters:
    for host in clusters[cluster]:
        to_remove = {}
        for seq_id in clusters[cluster][host]:
            if seq_id not in seqs_w_fasta[host]:
                to_remove[seq_id] = True
        for rm in to_remove:
            del clusters[cluster][host][rm]
# remove hosts with no seq_ids
for cluster in clusters:
    to_remove = {}
    for host in clusters[cluster]:
        if not clusters[cluster][host]:
            to_remove[host] = True
    for rm in to_remove:
        del clusters[cluster][rm]
Example #11
""" Create a table of FASTA sequences.
    ID -> sequence
"""
import utils, sys, random, os

fasta_file = sys.argv[1]
table_name = sys.argv[2]

max_len = 0
tmp = "tmp" + str(random.randint(0, 100))
with open(tmp, "w") as f:
    for protein, seq in utils.fasta_iter(fasta_file, lambda line: line.split("|")[1]):
        f.write("%s\t%s\n" % (protein, seq))
        max_len = max([len(seq), max_len])

(conn, cur) = utils.init_mysql("fasta")
line = "CREATE TABLE " + table_name + " ( seq_id CHAR(100), seq TEXT(" + str(max_len + 100) + ") )"
cur.execute(line)
line = (
    "LOAD DATA LOCAL INFILE '"
    + tmp
    + "' INTO TABLE "
    + table_name
    + " FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\n'"
)
cur.execute(line)
conn.commit()
cur.close()
conn.close()

os.remove(tmp)  # cleaner than shelling out to "rm"
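
LOAD DATA LOCAL INFILE is fast, but it only works when local_infile is enabled on both the client and the server. When it is not, a slower but more portable fallback is a parameterized bulk insert; a sketch, reusing the same utils helpers (assumed, not shown):

(conn, cur) = utils.init_mysql("fasta")
rows = [(protein, seq)
        for protein, seq in utils.fasta_iter(fasta_file, lambda line: line.split("|")[1])]
cur.executemany("INSERT INTO " + table_name + " (seq_id, seq) VALUES (%s, %s)", rows)
conn.commit()
cur.close()
conn.close()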