Exemple #1
0
 def __readAssembly(self, assembly) :
     """Read an assembly in FASTQ or FASTA format into a dictionary.

     The format is detected from the first character of the file: '@' selects
     the FASTQ parser (exactly four lines per record are assumed), anything
     else is parsed as FASTA.  For FASTQ input the sequences are additionally
     written to '<assembly without its last extension>.fasta', wrapped at 100
     bases per line.

     Parameters:
         assembly: path of the assembly file; it is opened with ``uopen``.

     Returns:
         (seq, fasfile).  ``seq`` maps each contig name to a list.  FASTQ
         records are ``[length, value, sequence, qualities]`` where ``value``
         is the third whitespace-separated header field converted to float
         (0. when absent) and ``qualities`` is a numpy array of the quality
         characters.  FASTA records are ``[length, 0., sequence]``.
         ``fasfile`` is the FASTA file holding the sequences: the newly
         written file for FASTQ input, ``assembly`` itself otherwise.
     """
     seq = {}
     # Peek at the first character only; the file is re-opened for parsing.
     with uopen(assembly) as fin :
         header = fin.read(1)
     with uopen(assembly) as fin :
         if header == '@' :
             for line_id, line in enumerate(fin) :
                 field = line_id % 4
                 if field == 0 :
                     part = line[1:].strip().split()
                     name = part[0]
                     seq[name] = [0, float(part[2]) if len(part) > 2 else 0., None, None]
                 elif field == 1 :
                     seq[name][2] = line.strip()
                     seq[name][0] = len(seq[name][2])
                 elif field == 3 :
                     seq[name][3] = np.array(list(line.strip()))
             fasfile = assembly.rsplit('.', 1)[0] + '.fasta'
             logger('Write fasta sequences into {0}'.format(fasfile))
             with open(fasfile, 'w') as fout :
                 for n, s in sorted(seq.items()) :
                     # range() works on both Python 2 and 3 (xrange does not).
                     wrapped = '\n'.join([ s[2][site:(site+100)] for site in range(0, len(s[2]), 100) ])
                     fout.write('>{0}\n{1}\n'.format(n, wrapped))
         else :
             fasfile = assembly
             name = None
             for line in fin :
                 if line.startswith('>') :
                     name = line[1:].strip().split()[0]
                     seq[name] = [0, 0., []]
                 elif name is not None :
                     # Text before the first header has no record to attach to
                     # and is ignored instead of raising NameError.
                     seq[name][2].extend( line.strip().split() )
             for n, s in seq.items() :
                 s[2] = ''.join(s[2])
                 s[0] = len(s[2])
     return seq, fasfile
Exemple #2
0
def readMap(data) :
    """Parse one refMapper GFF-like alignment file.

    Parameters:
        data: a ``(mTag, mFile)`` pair; ``mFile`` is opened with ``uopen`` and
            ``mTag`` is copied into every reported mutation.

    Returns:
        (presences, absences, mutations) where
        * presences holds ``[seq, start, end]`` for 'misc_feature' rows; a row
          that starts inside the previous region on the same sequence extends
          that region instead of adding a new one,
        * absences holds ``[seq, start, end, miss]`` for 'unsure' rows, with
          ``miss`` set to -99999999 when the header declares the same
          reference and query, and -1 otherwise,
        * mutations holds ``[mTag, seq, start, origin, replace]`` for
          'variation' rows carrying both ``origin="..."`` and
          ``replace="..."`` attributes.
    """
    mTag, mFile = data
    presences, absences, mutations = [], [], []

    miss = -1
    # ref stays None until a '## Reference: ' line is seen, so a '## Query: '
    # line that comes first no longer raises a NameError.
    ref = None
    with uopen(mFile) as fin :
        for line in fin :
            if not line.startswith('##') :
                break
            if line.startswith('## Reference: ') :
                ref = line.split(' ')[-1]
            elif line.startswith('## Query: ') :
                qry = line.split(' ')[-1]
                if ref == qry :
                    miss = -99999999
    with uopen(mFile) as fin :
        for line in fin :
            if line.startswith('#') : continue
            part = line.strip().split('\t')
            if len(part) < 5 :
                # Blank or truncated row: there are no coordinates to read.
                continue
            part[3:5] = [int(part[3]), int(part[4])]
            if part[2] == 'misc_feature' :
                if len(presences) == 0 or presences[-1][0] != part[0] or presences[-1][2] < part[3] :
                    presences.append([part[0], part[3], part[4]])
                elif presences[-1][2] < part[4] :
                    presences[-1][2] = part[4]
            elif part[2] == 'unsure' :
                absences.append([part[0], part[3], part[4], miss])
            elif part[2] == 'variation' :
                alt = re.findall(r'replace="([^"]+)"', part[8])
                ori = re.findall(r'origin="([^"]+)"', part[8])
                if len(alt) and len(ori) :
                    mutations.append([mTag, part[0], part[3], ori[0], alt[0]])
    return presences, absences, mutations
Exemple #3
0
def readXFasta(fasta_file, core=0.8):
    """Stream an xFasta alignment and print a SNP matrix to stdout.

    The file consists of alignment blocks terminated by a line starting with
    '='.  Headers have the form '>seqName:contName'.  Every block is handed to
    ``parseSNP`` together with a temporary output file (``fasta_file +
    '.tmp'``); sequences absent from a block are filled with '-' to the
    length of the first non-empty sequence of that block.  After all blocks
    are processed the '##' metadata lines, the column header and the content
    of the temporary file are written to stdout and the temporary file is
    removed.

    Parameters:
        fasta_file: path of the xFasta file (opened with ``uopen``).
        core: passed through unchanged to ``parseSNP``.

    Returns:
        None.

    NOTE(review): ``conservedSites`` is a module-level name not defined in
    this function; presumably ``parseSNP`` updates it - confirm.
    """
    seqs = []
    nameMap = {}
    contLens, missingLens = [], []
    tmp_file = fasta_file + '.tmp'

    def flushBlock(block, fout):
        # Join, upper-case and gap-fill one block, then pass it to parseSNP.
        for seq in block:
            seq[2] = ''.join(seq[2]).upper()
        blockLens = [len(seq[2]) for seq in block if seq[2] != '']
        if not blockLens:
            # Nothing was read for this block (e.g. consecutive '=' lines).
            return
        for seq in block:
            if seq[2] == '':
                seq[2] = '-' * blockLens[0]
        cLen, mLen = parseSNP(fout, block, core)
        contLens.append(cLen)
        missingLens.extend(mLen)

    try:
        with uopen(fasta_file) as fin, open(tmp_file, 'w') as fout:
            seqId = None
            for line in fin:
                if line.startswith('>'):
                    name = line[1:].strip().split()[0]
                    seqName = name.split(':', 1)[0]
                    contName = name.split(':', 1)[-1]
                    if seqName in nameMap:
                        seqId = nameMap[seqName]
                        seqs[seqId] = [seqName, contName, []]
                    else:
                        seqId = nameMap[seqName] = len(seqs)
                        seqs.append([seqName, contName, []])
                elif line.startswith('='):
                    flushBlock(seqs, fout)
                    # Rebuild the slots in nameMap index order so that
                    # seqs[nameMap[name]] keeps addressing the right entry
                    # regardless of dict iteration order.
                    seqs = [[n, '', []] for n, i in
                            sorted(nameMap.items(), key=lambda x: x[1])]
                elif seqId is not None:
                    seqs[seqId][2].extend(line.strip().split())
            # A final block that is not closed by '=' gets the same
            # normalisation as every other block.
            flushBlock(seqs, fout)

        sys.stdout.write('## Constant_bases: {0} {1} {2} {3}\n'.format(
            conservedSites[ord('A')], conservedSites[ord('C')],
            conservedSites[ord('G')], conservedSites[ord('T')]))
        for cLen in contLens:
            sys.stdout.write('## Sequence_length: {0} {1}\n'.format(*cLen))
        for mLen in missingLens:
            sys.stdout.write('## Missing_region: {0} {1} {2}\n'.format(*mLen))
        sys.stdout.write('#Seq\t#Site\t{0}\n'.format('\t'.join(
            [n for n, i in sorted(nameMap.items(), key=lambda n: n[1])])))
        with uopen(tmp_file) as fin:
            for line in fin:
                sys.stdout.write(line)
    finally:
        # Remove the scratch file even when parsing fails half-way.
        if os.path.exists(tmp_file):
            os.unlink(tmp_file)
    return
Exemple #4
0
def MLSTdb(args):
    """Build a reference-allele set and/or an allele lookup table.

    Parameters are obtained from ``getParams(args)``; the keys read here are
    'database', 'refset', 'alleleFasta', 'refstrain', 'max_iden', 'min_iden',
    'coverage', 'paralog' and 'relaxEnd'.  'alleleFasta' and 'refstrain' may
    each be either a file path or the FASTA text itself.

    Only alleles whose 'value_id' is a positive integer and whose 'fieldname'
    contains no '/' are kept.

    * When 'refset' is not None, ``buildReference`` is run; if 'refset' is
      non-empty the reference alleles are written to that path.
    * When 'database' is set, a CSV without header is written there with the
      MD5 of each allele sequence as index followed by its field name and
      allele number.

    Returns:
        (allele_text, refAlleles) as produced by ``buildReference``; both are
        empty strings when 'refset' is None.
    """
    params = getParams(args)
    database = params['database']
    refset = params['refset']
    alleleFasta = params['alleleFasta']
    refstrain = params['refstrain']
    max_iden = params['max_iden']
    min_iden = params['min_iden']
    coverage = params['coverage']
    paralog = params['paralog']
    relaxEnd = params['relaxEnd']

    def loadFasta(source):
        # `source` is a path to a FASTA file or the FASTA text itself.
        if os.path.isfile(source):
            # Materialise inside the `with` so the handle is closed promptly.
            with uopen(source) as fin:
                return list(readFasta(fin))
        return list(readFasta(StringIO(source)))

    alleles = [allele for allele in loadFasta(alleleFasta)
               if allele['value_id'].isdigit()
               and int(allele['value_id']) > 0
               and allele['fieldname'].find('/') < 0]

    # Defined up front so the final return works when refset is None.
    allele_text, refAlleles = '', ''
    if refset is not None:
        if refstrain:
            references = loadFasta(refstrain)
        else:
            # No reference strain given: use the first allele of every locus.
            loci, references = {}, []
            for allele in alleles:
                if allele['fieldname'] not in loci:
                    loci[allele['fieldname']] = 1
                    references.append(allele)

        allele_text, refAlleles = buildReference(alleles, references, max_iden,
                                                 min_iden, coverage, paralog,
                                                 relaxEnd)
        if refset:
            with open(str(refset), 'w') as fout:
                fout.write(refAlleles + '\n')
            logger('A file of reference alleles has been generated:  {0}'.format(
                refset))
    if database:
        md5s = [get_md5(allele['value']) for allele in alleles]
        rows = [[allele['fieldname'], int(allele['value_id'])]
                for allele in alleles]
        conversion = pd.DataFrame(rows, index=md5s)
        conversion.to_csv(database, header=False)
        logger('A lookup table of all alleles has been generated:  {0}'.format(
            database))
    return allele_text, refAlleles
Exemple #5
0
def xFasta2Matrix(prefix, fasta_file, core=0.95):
    """Convert an xFasta alignment into '<prefix>.matrix.gz'.

    Alignment blocks are separated by lines starting with '='.  Each block is
    completed with ``fillMissingSeq`` and, when that call returns a true
    value, summarised with ``parse_snps``.  The per-block results are then
    merged into a single tab-delimited matrix file.

    Parameters:
        prefix: prefix of the output file; also forwarded to ``parse_snps``.
        fasta_file: path of the xFasta file (opened with ``uopen``).
        core: forwarded unchanged to ``parse_snps``.

    Returns:
        The path of the written matrix, ``prefix + '.matrix.gz'``.
    """
    nameMap = {}
    blockResults = []
    seqs = []
    with uopen(fasta_file) as fin:
        for line in fin:
            if line.startswith('='):
                # End of a block: analyse it, then start with fresh slots
                # ordered by the index recorded in nameMap.
                blockId = len(blockResults)
                if fillMissingSeq(seqs, blockId):
                    blockResults.append(
                        parse_snps(prefix, len(blockResults), seqs, core))
                seqs = [[label]
                        for label in sorted(nameMap, key=nameMap.get)]
            elif line.startswith('>'):
                name = line[1:].strip().split()[0]
                pieces = name.split(':', 1)
                seqName, contName = pieces[0], pieces[-1]
                entry = [seqName, contName, []]
                if seqName in nameMap:
                    seqId = nameMap[seqName]
                    seqs[seqId] = entry
                else:
                    seqId = nameMap[seqName] = len(seqs)
                    seqs.append(entry)
            else:
                seqs[seqId][2].extend(line.strip().split())
    # The last block may not be terminated by '='.
    if fillMissingSeq(seqs, len(blockResults)):
        blockResults.append(
            parse_snps(prefix, len(blockResults), seqs, core))

    const_sites = np.sum([result[2] for result in blockResults], axis=0)
    names = sorted(nameMap, key=nameMap.get)
    outName = prefix + '.matrix.gz'
    with uopen(outName, 'w') as fout:
        fout.write('## Constant_bases: {0}\n'.format(
            ' '.join(const_sites.astype(str))))
        for result in blockResults:
            fout.write('## Sequence_length: {0} {1}\n'.format(
                result[0], result[1]))
        for result in blockResults:
            for s, e in result[3]:
                fout.write('## Missing_region: {0} {1} {2}\n'.format(
                    result[0], s, e))
        fout.write('#seq\t#site\t{0}\n'.format('\t'.join(names)))
        for result in blockResults:
            # result[4] is the file parse_snps stored the block's data in.
            stored = np.load(result[4])
            sites, variants = stored['sites'], stored['snps']
            for site in sites:
                row = '\t'.join(variants[site[1]].astype(str))
                fout.write('{0}\t{1}\t{2}\n'.format(result[0], site[0], row))
            os.unlink(result[4])
    return prefix + '.matrix.gz'
Exemple #6
0
def phylo(args) :
    """Run the phylogeny pipeline steps selected in ``args.tasks``.

    ``args`` is first normalised by ``add_args``.  The task names tested here
    are 'matrix', 'phylogeny', 'ancestral', 'ancestral_proportion' and
    'mutation'.  Each stage either computes its result or, when the stage is
    not requested but a later one needs it, reads a previously written file
    (``args.snp``, ``args.tree``, ``args.ancestral``).

    Files written, depending on the tasks: '<prefix>.matrix.gz',
    '<prefix>.tre', '<prefix>.labelled.nwk', '<prefix>.ancestral_states.gz',
    '<prefix>.ancestral_proportion.gz' and '<prefix>.mutations.gz'.

    Raises:
        ValueError: when fewer than four taxa are present.
        AssertionError: when a required input file does not exist.
    """
    args = add_args(args)
    # The worker pool is published as a module-level global; it is not closed
    # anywhere in this function.
    global pool
    pool = Pool(args.n_proc)
    
    # --- stage 1: obtain the SNP matrix, either from an alignment or a file
    if 'matrix' in args.tasks :
        assert os.path.isfile( args.alignment )
        seq = readXFasta( args.alignment )
        names, sites, snps, seqLens, missing = parse_snps(seq, args.core)
        args.snp = write_matrix(args.prefix+'.matrix.gz', names, sites, snps, seqLens, missing)
        if len(names) < 4 :
            raise ValueError('Taxa too few.')
        # One entry per distinct SNP pattern: [info[0], ceil(info[2]), pattern, info[1]].
        snp_list = sorted([[info[0], int(math.ceil(info[2])), list(line), info[1]] for line, info in snps.items() ])
    elif 'phylogeny' in args.tasks or 'ancestral' in args.tasks or 'ancestral_proportion' in args.tasks :
        assert os.path.isfile( args.snp )
        names, sites, snps, seqLens, missing = read_matrix(args.snp)
        if len(names) < 4 :
            raise ValueError('Taxa too few.')
        snp_list = sorted([[info[0], int(math.ceil(info[2])), list(line), info[1]] for line, info in snps.items() ])


    # build tree
    if 'phylogeny' in args.tasks :
        phy, weights, asc = write_phylip(args.prefix+'.tre', names, snp_list)
        if phy != '' :
            args.tree = run_raxml(args.prefix +'.tre', phy, weights, asc, 'CAT', args.n_proc)
        else :
            # write_phylip returned an empty string: write a star tree with
            # zero-length branches instead of running RAxML.
            args.tree = args.prefix + '.tre'
            with open(args.tree, 'w') as fout :
                fout.write('({0}:0.0);'.format(':0.0,'.join(names)))
        args.tree = get_root(args.prefix, args.tree)
    elif 'ancestral' in args.tasks or 'ancestral_proportion' in args.tasks :
        # NOTE(review): `tree` is not used later in this function; the call
        # presumably serves to validate that args.tree can be parsed - confirm.
        tree = Tree(args.tree, format=1)

    # map snp
    if 'ancestral' in args.tasks :
        final_tree, node_names, states = infer_ancestral(args.tree, names, snp_list, sites, infer='viterbi')
        states = np.array(states)
        final_tree.write(format=1, outfile=args.prefix + '.labelled.nwk')
        write_states(args.prefix+'.ancestral_states.gz', node_names, states, sites, seqLens, missing)
    elif 'mutation' in args.tasks :
        # Ancestral states were not inferred in this run: load them from disk.
        final_tree = Tree(args.tree, format=1)
        node_names, states, sites = read_states(args.ancestral)

    if 'ancestral_proportion' in args.tasks :
        # Rebinds final_tree/node_names/states with the 'margin' inference
        # results, which the 'mutation' stage below then uses.
        final_tree, node_names, states = infer_ancestral(args.tree, names, snp_list, sites, infer='margin')
        final_tree.write(format=1, outfile=args.prefix + '.labelled.nwk')
        write_ancestral_proportion(args.prefix+'.ancestral_proportion.gz', node_names, states, sites, seqLens, missing)

    if 'mutation' in args.tasks :
        mutations = get_mut(final_tree, node_names, states, sites)
        with uopen(args.prefix + '.mutations.gz', 'w') as fout :
            # NOTE(review): seqLens and missing are only assigned in stage 1;
            # running 'mutation' on its own would leave them undefined - confirm.
            for sl in seqLens :
                fout.write('## Sequence_length: {0} {1}\n'.format(*sl))
            for ms in missing :
                fout.write('## Missing_region: {0} {1} {2}\n'.format(*ms))
            
            fout.write('#Node\t#Seq\t#Site\t#Homoplasy\t#Mutation\n')
            for mut in mutations :
                fout.write('\t'.join([str(m) for m in mut]) + '\n')
Exemple #7
0
def readXFasta(fasta_file) :
    """Read an xFasta alignment into a list of gap-padded blocks.

    Blocks are separated by lines starting with '='.  Headers have the form
    '>seqName:contName'.  Every block after the first is pre-populated with
    one empty entry per sequence of the first block, whose second field is
    the running block number as a string.

    Returns:
        A list of blocks; each block is a list of
        ``[seqName, contName, sequence]`` entries whose sequences are padded
        with '-' on the right to the longest sequence of the block.  Blocks in
        which every sequence is empty are dropped.
    """
    nameMap = {}
    current = []
    blocks = [current]
    with uopen(fasta_file) as fin :
        for line in fin :
            if line.startswith('=') :
                # Open a new block mirroring the sequence order of block 0.
                blockTag = str(len(blocks))
                current = [[first[0], blockTag, []] for first in blocks[0]]
                blocks.append(current)
            elif line.startswith('>') :
                header = line[1:].strip().split()[0]
                pieces = header.split(':', 1)
                seqName, contName = pieces[0], pieces[-1]
                record = [seqName, contName, []]
                if seqName in nameMap :
                    seqId = nameMap[seqName]
                    current[seqId] = record
                else :
                    seqId = nameMap[seqName] = len(current)
                    current.append(record)
            else :
                current[seqId][2].extend(line.strip().split())

    kept = []
    for group in blocks :
        for record in group :
            record[2] = ''.join(record[2])
        width = max([len(record[2]) for record in group] + [0])
        if width <= 0 :
            continue
        for record in group :
            # Right-pad shorter sequences with gap characters.
            record[2] = record[2].ljust(width, '-')
        kept.append(group)
    return kept
Exemple #8
0
    def write_down(filename, regions, repeats, mutations, reference, query, tag) :
        """Write alignment results to a GFF3-like file.

        Parameters:
            filename: output path, opened for writing with ``uopen``.
            regions: aligned regions; fields 0-3, 7-9 and 10 of each entry are
                written as 'misc_feature' rows.
            repeats: ``[seq, start, end, kind]`` entries written as 'unsure'
                rows; ``kind == 0`` is labelled as a repetitive region, any
                other value as uncertain base calling / ambiguous alignment.
            mutations: ``{contig: {site: {alter: source}}}``; ``source`` is a
                flat list read in groups of nine values.
            reference, query, tag: written to the '##' header lines.

        Every row is terminated individually, so empty ``regions`` or
        ``repeats`` no longer leave blank lines in the output.
        """
        with uopen(filename, 'w') as fout:
            fout.write('##gff-version 3\n')
            fout.write('## Reference: {0}\n'.format(reference))
            fout.write('## Query: {0}\n'.format(query))
            fout.write('## Tag: {0}\n'.format(tag))

            for r in regions :
                inference = '/inference="Aligned%20with%20{0}:{1}-{2}"'.format(*r[7:10])
                fout.write('{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t{5}\n'.format(r[1], r[2], r[3], r[0], r[10], inference))
            for r in repeats :
                reason = 'Repetitive%20region' if r[3] == 0 else 'Uncertain%20base%20calling%20or%20ambigious%20alignment'
                fout.write('{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="{3}"\n'.format(r[0], r[1], r[2], reason))
            for contig, variation in sorted(mutations.items()):
                for site, alters in sorted(variation.items()) :
                    for alter, source in alters.items() :
                        if source[6][0] == '-' :
                            # Gap on the origin side: report as an insertion.
                            difference = '+{0}'.format(source[7])
                            origin = '.'
                        elif source[7][0] == '-' :
                            # Gap on the replacement side: report as a deletion.
                            difference = '-{0}'.format(source[6])
                            origin = source[6]
                        else :
                            difference = source[7]
                            origin = source[6]
                        # One 'name:start-end:value' item per group of nine fields.
                        compare = ';'.join([
                            '{0}:{1}-{2}:{3}'.format(source[k+0], abs(source[k+4]), abs(source[k+5]), source[k+1])
                            for k in range(0, len(source), 9) ])
                        fout.write('{0}\trefMapper\tvariation\t{1}\t{2}\t.\t+\t.\t{3}\n'.format(contig, source[2], source[3], '/replace="{0}";/compare="{1}";/origin="{2}"'.format(difference, compare, origin)))
Exemple #9
0
def main(mgs):
    """Print the genotypes of the best-scoring solution in each result file.

    Each file is scanned for 'logp:' lines, every one of which starts a new
    candidate solution.  'Genotype N:' lines add a genotype to the current
    solution, and the first following line that is not a genotype line
    supplies three more tab-separated fields for it.  For the solution that
    compares highest (largest logp first) the genotypes are printed in order
    of decreasing mean proportion as
    ``file<TAB>rank<TAB>proportion<TAB>CI95<TAB>field0<TAB>field1<TAB>field3 field5``.

    Parameters:
        mgs: iterable of file paths, each opened with ``uopen``.

    Files without any 'logp:' line, or whose proportions cannot be parsed as
    numbers, produce no output.
    """
    logp_re = re.compile(r'logp:\t([-eE\d\.]+)')
    genotype_re = re.compile(
        r'Genotype (\d+):\tMean proportion:\t([eE\d\.]+)\tCI95%:\t(\[ [eE\d\.]+ - [eE\d\.]+ \])')

    for mg in mgs:
        res = []
        with uopen(mg) as fin:
            for line in fin:
                logp = logp_re.findall(line)
                if logp:
                    res.append([float(logp[0])])
                    continue
                if not res:
                    # No solution has been opened yet; nothing to attach to.
                    continue
                genotype = genotype_re.findall(line)
                if genotype:
                    res[-1].append(
                        [genotype[0][1], genotype[0][2], '', '', ''])
                elif len(res[-1]) > 1 and res[-1][-1][-1] == '':
                    # First detail line after a genotype: fill its blanks.
                    part = line.strip().split('\t')
                    res[-1][-1][2:] = [
                        part[0], part[1], part[3] + ' ' + part[5]
                    ]
        if not res:
            continue
        best = max(res)
        try:
            genotypes = sorted(best[1:], key=lambda x: -float(x[0]))
        except ValueError:
            # A proportion matched the pattern but is not a valid number
            # (e.g. '.'); skip this file as the original best-effort code did.
            continue
        for i, r in enumerate(genotypes, 1):
            print('{0}\t{1}\t{2}'.format(mg, i, '\t'.join(r)))
Exemple #10
0
def loadMatrix(fname):
    """Load a mutation table with '##' metadata into per-sequence arrays.

    '## Sequence_length' lines create an int8 zero array per sequence and
    '## Missing_region' lines set the given 1-based inclusive range to -1.
    The first line not starting with '##' is consumed as the column header
    and the remainder is read as a tab-separated table.  Column 4 is encoded
    from its first and last character (A=1, C=2, G=4, T=8); rows whose code
    is not positive are discarded and column 2 is converted to 0-based.

    Returns:
        (sequences, snps) where ``sequences[name][site]`` has the codes of
        all retained rows OR-ed in, and
        ``snps[(name, site)][code]`` lists the column-0 values of those rows.
    """
    encode = {'A': 1, 'C': 2, 'G': 4, 'T': 8}
    sequences, snps = {}, {}
    with uopen(fname) as fin:
        for line in fin:
            if not line.startswith('##'):
                # Column header line: consumed, its content is not needed.
                break
            part = line.strip().split()
            if line.startswith('## Sequence_length'):
                sequences[part[2]] = np.zeros(int(part[3]), dtype=np.int8)
            elif line.startswith('## Missing_region'):
                track = sequences[part[2]]
                track[int(part[3]) - 1:int(part[4])] = -1
        matrix = pd.read_csv(fin, header=None, sep='\t').values
        codes = [
            encode.get(d[0], -9999) + encode.get(d[-1], -9999)
            for d in matrix.T[4].tolist()
        ]
        matrix.T[4] = codes
        matrix = matrix[matrix.T[4] > 0]
        matrix.T[2] -= 1
        for row in matrix:
            label, seqName, site, code = row[0], row[1], row[2], row[4]
            sequences[seqName][site] |= code
            snps.setdefault((seqName, site), {}).setdefault(code, []).append(label)
    return sequences, snps
Exemple #11
0
def iter_readGFF(fname):
    """Read sequences and CDS features from a GFF file with embedded FASTA.

    Feature rows are parsed until the first '>' line; from there on the file
    is treated as FASTA.  A feature's name is taken from, in order of
    preference, its ``locus_tag``, the name recorded for its ``Parent``, its
    ``Name`` or its ``ID`` attribute.

    Parameters:
        fname: path of the GFF file (opened with ``uopen``).

    Returns:
        (seq, cds) where ``seq[name] = [fname, sequence]`` (upper-cased) and
        ``cds[name] = [fname, seqName, start, end, strand, hash, sequence]``.
        ``sequence`` is reverse-complemented for '-' strand features and
        ``hash`` is the SHA-1 of the sequence as an integer.  A CDS that
        fails ``checkCDS`` or cannot be extracted keeps ``hash == 0`` and an
        empty sequence.

    Raises:
        AssertionError: on a duplicated sequence name or a CDS without any
            usable name.  Raised explicitly so the checks survive ``-O``.
    """
    seq, cds = {}, {}
    names = {}
    with uopen(fname) as fin:
        sequenceMode = False
        for line in fin:
            if line.startswith('#'):
                continue
            elif line.startswith('>'):
                sequenceMode = True
                name = line[1:].strip().split()[0]
                if name in seq:
                    msg = 'Error: duplicated sequence name {0}'.format(name)
                    logger(msg)
                    raise AssertionError(msg)
                seq[name] = [fname, []]
            elif sequenceMode:
                seq[name][1].extend(line.strip().split())
            else:
                part = line.strip().split('\t')
                if len(part) > 2:
                    name = re.findall(r'locus_tag=([^;]+)', part[8])
                    if len(name) == 0:
                        parent = re.findall(r'Parent=([^;]+)', part[8])
                        if len(parent) and parent[0] in names:
                            name = names[parent[0]]
                    if len(name) == 0:
                        name = re.findall(r'Name=([^;]+)', part[8])
                    if len(name) == 0:
                        name = re.findall(r'ID=([^;]+)', part[8])

                    if part[2] == 'CDS':
                        if len(name) == 0:
                            msg = 'Error: CDS has no name. {0}'.format(line)
                            logger(msg)
                            raise AssertionError(msg)
                        #          source_file, seqName, Start,       End,      Direction, hash, Sequences
                        cds[name[0]] = [
                            fname, part[0],
                            int(part[3]),
                            int(part[4]), part[6], 0, ''
                        ]
                    else:
                        # Remember the name of non-CDS features so children
                        # can inherit it through their Parent attribute.
                        ids = re.findall(r'ID=([^;]+)', part[8])
                        if len(ids):
                            names[ids[0]] = name

    for n in seq:
        seq[n][1] = ''.join(seq[n][1]).upper()
    for n in cds:
        c = cds[n]
        try:
            c[6] = seq[c[1]][1][(c[2] - 1):c[3]]
            if c[4] == '-':
                c[6] = rc(c[6])
            if not checkCDS(n, c[6]):
                c[6] = ''
            else:
                c[5] = int(hashlib.sha1(c[6].encode('utf-8')).hexdigest(), 16)
        except Exception:
            # Best effort: a CDS whose sequence cannot be extracted (e.g. its
            # contig is missing) is kept with an empty sequence.  Unlike a
            # bare except this lets KeyboardInterrupt/SystemExit propagate.
            c[6] = ''

    return seq, cds
Exemple #12
0
def prepReference(prefix, ref_tag, reference, aligner, pilercr, trf, **args) :
    """Index the reference, self-align it and mark its repetitive regions.

    When ``reference`` is given, an index '<prefix>.mmi' is built with
    ``aligner`` (a list selects the second command form, using its first
    element), the reference is aligned against itself through
    ``alignAgainst``, and tandem repeats (found with ``trf``) plus CRISPR
    arrays (found with ``pilercr``) are appended as 'unsure' rows to the
    file named by the second element of the alignment result.

    Parameters:
        prefix: prefix for all generated files.
        ref_tag: tag of the reference; its part before the last '.' is used
            in the alignment output name.
        reference: path of the reference, read with ``readFastq``.
        aligner: aligner executable, or a list whose first item is used.
        pilercr, trf: executables of PILER-CR and Tandem Repeats Finder.
        **args: accepted and ignored.

    Returns:
        The value returned by ``alignAgainst``, or None when ``reference`` is
        empty (previously this case raised UnboundLocalError).
    """
    def mask_tandem(fasta_file) :
        # Run TRF and collect [contig, start-2, end+2] for every repeat.
        cmd = '{0} {1} 2 4 7 80 10 60 2000 -d -h -ngs'.format(trf, fasta_file)
        trf_run = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, universal_newlines=True)

        region = []
        cont_name = None
        for line in iter(trf_run.stdout.readline, r'') :
            if line[0] == '@' :
                cont_name = line[1:].strip().split()[0]
            else :
                part = line.split(' ',2)[:2]
                region.append([cont_name, int(part[0])-2, int(part[1])+2])
        # Reap the child; its exit status is deliberately not checked.
        trf_run.stdout.close()
        trf_run.wait()
        return region

    def mask_crispr(fasta_file, prefix) :
        # Run PILER-CR and parse the 'SUMMARY BY POSITION' section of its report.
        cmd = '{0} -in {1} -out {2}.crispr'.format(pilercr, fasta_file, prefix)
        subprocess.Popen(cmd.split(), stderr=subprocess.PIPE).communicate()
        summary_trigger = 0

        region = []
        with open('{0}.crispr'.format(prefix)) as fin :
            for line in fin :
                if line.startswith('SUMMARY BY POSITION') :
                    summary_trigger = 1
                elif summary_trigger :
                    if line[0] == '>' :
                        cont_name = line[1:].strip().split()[0]
                    elif len(line) > 10 and line.strip()[0] in '0123456789' :
                        # Text from column 24 on starts with position and length.
                        part = line[24:].strip().split()
                        region.append([cont_name, int(part[0]), int(part[0]) + int(part[1]) -1])
        os.unlink('{0}.crispr'.format(prefix))
        return region

    alignments = None
    # prepare reference
    if reference :
        if not isinstance(aligner, list) :
            subprocess.Popen('{0} -k15 -w5 -d {2}.mmi {1}'.format(aligner, reference, prefix).split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
        else :
            subprocess.Popen('{0} -cR01 {2}.mmi {1}'.format(aligner[0], reference, prefix).split()).communicate()
        import tempfile
        # The temporary file is only used to reserve a unique name in the
        # current directory; the tools work on '<name>.fasta'.
        with tempfile.NamedTemporaryFile(dir='.') as tf :
            seq, _ = readFastq(reference)
            tf_fas = '{0}.fasta'.format(tf.name)
            try :
                with open(tf_fas, 'wt') as fout:
                    for n, s in seq.items() :
                        fout.write('>{0}\n{1}\n'.format(n, s))
                repeats = mask_tandem(tf_fas) + mask_crispr(tf_fas, tf.name)
            finally :
                # Remove the scratch FASTA even when one of the tools fails.
                if os.path.exists(tf_fas) :
                    os.unlink(tf_fas)
        alignments = alignAgainst([prefix +'.' + ref_tag.rsplit('.', 1)[0] + '.0', aligner, prefix + '.mmi', [ref_tag, reference], [ref_tag, reference]])
        with uopen(alignments[1], 'a') as fout :
            for r in repeats :
                fout.write('{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="repetitive_regions"\n'.format(
                    r[0], r[1], r[2], 
                ))
    return alignments
Exemple #13
0
def write_states(fname, names, states, sites, seqLens, missing) :
    """Write per-site states as a tab-delimited table.

    The file starts with one '## Sequence_length:' line per entry of
    ``seqLens`` and one '## Missing_region:' line per entry of ``missing``,
    followed by a '#Seq', '#Site' header with the node ``names``.  Each entry
    of ``sites`` yields one row: its first two fields, then the states found
    at ``states[site[2]]``.
    """
    with uopen(fname, 'w') as fout :
        emit = fout.write
        for seq_info in seqLens :
            emit('## Sequence_length: {0} {1}\n'.format(*seq_info))
        for region in missing :
            emit('## Missing_region: {0} {1} {2}\n'.format(*region))
        emit('#Seq\t#Site\t{0}\n'.format('\t'.join(names)))
        for site in sites :
            contig, position, state_id = site[0], site[1], site[2]
            row = '\t'.join(states[state_id])
            emit('{0}\t{1}\t{2}\n'.format(contig, position, row))
Exemple #14
0
def read_matrix(fname) :
    """Read a SNP matrix file and group its rows by SNP pattern.

    Returns:
        (names, sites, snps, seqLens, missing) where
        * names are the labels of the data columns,
        * sites holds ``[seq, position, pattern_id]`` for every row whose
          pattern has more than one distinct non-'-' value,
        * snps maps each pattern (a tuple of column values) to
          ``[pattern_id, n_distinct_values - 1, accumulated_weight]``,
        * seqLens and missing are taken from the '## Sequence_length:' and
          '## Missing_region:' header lines.
    """
    sites, snps = [], {}
    invariant = []
    seqLens, missing = [], []
    with uopen(fname) as fin :
        for line_id, line in enumerate(fin) :
            if line.startswith('##'):
                if line.startswith('## Constant_bases') :
                    # Counts of constant A, C, G and T sites, in that order.
                    part = line[2:].strip().split()
                    invariant = zip(['A', 'C', 'G', 'T'], [float(v) for v in part[1:]])
                elif line.startswith('## Sequence_length:') :
                    part = line[2:].strip().split()
                    seqLens.append([part[1], int(part[2])])
                elif line.startswith('## Missing_region:') :
                    part = line[2:].strip().split()
                    missing.append([part[1], int(part[2]), int(part[3])])
            elif line.startswith('#') :
                # Column header: fields starting with '#' are metadata columns,
                # those starting with '#!W' carry per-row weights, and all
                # remaining fields are sample names.
                part = np.array(line.strip().split('\t'))
                cols = np.where((1 - np.char.startswith(part, '#')).astype(bool))[0]
                w_cols = np.where(np.char.startswith(part, '#!W'))[0]
                names = part[cols]
                break
            else :
                # No '#' header line: this line is consumed and everything
                # after its first two fields is used as the sample names.
                # NOTE(review): cols and w_cols are boolean masks in this
                # branch, so the usecols list built below consists of booleans
                # followed by [0, 1] - confirm this path is actually exercised.
                part = np.array(line.strip().split('\t'))
                cols = np.ones(part.shape, dtype=bool)
                cols[:2] = False
                w_cols = np.char.startswith(part, '#!W')
                names = part[cols]
                break
        # The remaining lines of the same handle are the data rows.
        mat = pd.read_csv(fin, header=None, sep='\t', usecols=cols.tolist() + w_cols.tolist() + [0,1]).values
        
    # Single-character values are normalised: 'N' and anything not listed
    # become '-'; values longer than one character are kept as they are.
    val = {'A':'A', 'C':'C', 'G':'G', 'T':'T', '-':'-', 'N':'-', '.':'.'}
    validate = np.vectorize(lambda b: b if len(b) > 1 else val.get(b, '-'))
    
    for p2 in mat :
        part = validate(np.char.upper(p2[cols].astype(str)))
        b_key = tuple(part)
        # Row weight is the product of all weight columns (1 when there are none).
        w = np.multiply.reduce(p2[w_cols].astype(float)) if w_cols.size else 1.
        
        if b_key in snps :
            snps[b_key][2] += w
        else :
            types = dict(zip(*np.unique(part, return_index=True)))
            types.pop('-', None)
            snps[b_key] = [len(snps), len(types)-1, w]
            
        # Only rows with at least two distinct non-'-' values are recorded as sites.
        if snps[b_key][1] > 0 :
            sites.append([ p2[0], int(p2[1]), snps[b_key][0] ])

    # Add the constant sites as patterns in which every sample has the same base.
    for inv in invariant :
        b_key = tuple([inv[0]] * len(names))
        if b_key not in snps :
            snps[b_key] = [len(snps), 0, float(inv[1])]
        else :
            snps[b_key][2] += float(inv[1])
    return names, sites, snps, seqLens, missing
Exemple #15
0
def readRec(fname):
    """Read a four-column tab-separated file of regions.

    Returns:
        A dict mapping ``(column0, column1)`` to the list of
        ``[int(column2), int(column3)]`` pairs found for that key, in file
        order.
    """
    rec = {}
    with uopen(fname) as fin:
        for line in fin:
            fields = line.strip().split('\t')
            key = (fields[0], fields[1])
            bounds = [int(fields[2]), int(fields[3])]
            rec.setdefault(key, []).append(bounds)
    return rec
Exemple #16
0
def write_ancestral_proportion(fname, names, states, sites, seqLens, missing) :
    """Write per-node state proportions for every site.

    After the '## Sequence_length:' / '## Missing_region:' metadata and the
    column header, each ``(seq, position, index)`` entry of ``sites`` yields
    one row per node name.  ``states[index]`` is a ``(tags, values)`` pair;
    a row lists 'tag:value' items with the value formatted to five decimals.
    """
    with uopen(fname, 'w') as fout :
        for seq_info in seqLens :
            fout.write('## Sequence_length: {0} {1}\n'.format(*seq_info))
        for region in missing :
            fout.write('## Missing_region: {0} {1} {2}\n'.format(*region))

        fout.write('#Seq\t#Site\t#Type:Proportion\n')
        for contig, position, state_id in sites :
            tags, per_node = states[state_id]
            for node, proportions in zip(names, per_node) :
                items = ['{0}:{1:.5f}'.format(t, s) for t, s in zip(tags, proportions)]
                fout.write('{0}\t{1}\t{2}\t{3}\n'.format(contig, position, node, '\t'.join(items)))
Exemple #17
0
def RecHMM(args) :
    """Fit or load a recombination HMM and report / predict with it.

    ``args`` is parsed by ``parse_arg``.  Unless both ``args.report`` and
    ``args.model`` are set, the mutation table ``args.data`` is read: its
    '## Sequence_length:' and '## Missing_region:' header lines, then rows
    whose fifth column is a single-base substitution such as 'A->G'.
    Branches are renumbered by decreasing number of mutations.

    The model is loaded from ``args.model`` when given; otherwise it is
    fitted and saved to '<prefix>.best.model.json'.  ``model.report`` is
    always called and ``model.predict`` is called unless ``args.report`` is
    set.
    """
    args = parse_arg(args)
    global pool, verbose
    pool = Pool(args.n_proc)
    verbose = not args.clean

    model = recHMM(prefix=args.prefix, mode=args.task)
    
    if not args.report or not args.model :
        sequences, missing = [], []
        with uopen(args.data) as fin :
            for line in fin :
                if line.startswith('##') :
                    if line.startswith('## Sequence_length:') :
                        part = line[2:].strip().split()
                        sequences.append([part[1], int(part[2])])
                    elif line.startswith('## Missing_region:') :
                        part = line[2:].strip().split()
                        missing.append([part[1], int(part[2]), int(part[3])])
                else :
                    # First non-'##' line (the column header) is consumed here.
                    break
            data = pd.read_csv(fin, sep='\t', dtype=str, header=None).values
        branches, mutations = {}, []
        seqLens = {seqName:[seqId, seqLen] for seqId, (seqName, seqLen) in enumerate(sequences)}
        for d in data :
            if re.findall(r'^[ACGTacgt]->[ACGTacgt]$', d[4]) :
                # Sequences missing from the header are added on the fly and
                # declared lengths are extended to cover every mutated site.
                if d[1] not in seqLens :
                    seqLens[d[1]] = [len(seqLens), int(d[2])]
                if seqLens[d[1]][1] < int(d[2]) :
                    seqLens[d[1]][1] = int(d[2])
                if d[0] not in branches :
                    branches[d[0]] = len(branches)
                brId, seqId = branches[d[0]], seqLens[d[1]][0]
                mutations.append([brId, seqId, int(d[2]), int(d[3])])
        missing = np.array([ [seqLens.get(m[0], [-1])[0], m[1], m[2]] for m in missing ])
        sequences = [ [n, i[1]] for n, i in sorted(seqLens.items(), key=lambda x:x[1][0])]
        branches = np.array([ br for br, id in sorted(branches.items(), key=lambda x:x[1]) ])
        mutations = np.array(mutations)
        # Renumber branches so that index 0 is the branch with most mutations.
        reorder = np.argsort(-np.bincount(mutations.T[0]))
        branches = branches[reorder]
        reorder = np.array([i1 for i1, i2 in sorted(enumerate(reorder), key=lambda x:x[1])])
        mutations.T[0] = reorder[mutations.T[0]]
    if args.model :
        # Context managers make sure the model files are closed (and, when
        # saving, flushed) as soon as the model has used them.
        with open(args.model, 'r') as model_in :
            model.load(model_in)
    else :
        model.fit(mutations, branches=branches, sequences=sequences, missing=missing, categories=args.categories, init=args.init, cool_down=args.cool_down)
        with open(args.prefix + '.best.model.json', 'w') as model_out :
            model.save(model_out)
        print('Best HMM model is saved in {0}'.format(args.prefix + '.best.model.json'))
    model.report(args.bootstrap)

    if not args.report :
        model.predict(mutations, branches=branches, sequences=sequences, missing=missing, marginal=args.marginal, tree=args.tree)
Exemple #18
0
def readFasta(fasta):
    """Read a FASTA file into a list of ``[name, sequence]`` pairs.

    The name is the first whitespace-delimited word after '>'.  Lines
    starting with '#' are ignored.  Sequences are concatenated with all
    whitespace removed and converted to upper case.  Records are returned in
    file order.
    """
    records = []
    with uopen(fasta) as fin:
        for line in fin:
            if line.startswith('>'):
                label = line[1:].strip().split()[0]
                records.append([label, []])
                continue
            if len(line) > 0 and not line.startswith('#'):
                chunks = records[-1][1]
                chunks.extend(line.strip().split())
    for record in records:
        record[1] = ''.join(record[1]).upper()
    return records
Exemple #19
0
def write_filtered_matrix(fname, names, sites, snps, masks, m_weight):
    """Write a SNP matrix with masked samples blanked out and a weight column.

    For every site that has weights in ``m_weight`` the sample values are
    reconstructed, samples named in ``masks`` are split off into separate
    rows (all other samples set to '-'), and each resulting row that still
    has more than one distinct value (ignoring '-', 'N', 'n') is written with
    the mean weight of the site in a trailing '#!W[RecFilter]' column.

    Returns:
        ``fname``.
    """
    # Weighted base counts keyed by code point: 65='A', 67='C', 71='G', 84='T'.
    bases = {65: 0, 67: 0, 71: 0, 84: 0}
    for snp in snps:
        for n, c in zip(*np.unique(snp[2], return_counts=True)):
            if n in bases:
                bases[n] += c * snp[1]

    name_map = {name: id for id, name in enumerate(names)}
    sss = []
    for site in sites:
        # Sites without any weight are dropped from the output.
        if site[1] not in m_weight or not len(m_weight[site[1]]): continue
        weight = np.mean(list(m_weight[site[1]].values()))
        if site[3].size == 0:
            # No allele table: the stored codes are converted with chr().
            snvs = np.frompyfunc(chr, 1, 1)(snps[site[2]][2])
        else:
            # The stored values index into the allele table site[3]; 45 is
            # ord('-') and is remapped to the position of '-' in that table,
            # which is appended first when missing.  Note that x aliases the
            # array held in snps, so this remapping modifies it in place.
            x = snps[site[2]][2]
            if 45 in x:
                if '-' not in site[3]:
                    site[3] = np.concatenate([site[3], ['-']])
                x[x == 45] = np.where(site[3] == '-')[0][0]
            snvs = site[3][x]
        snv_x = []
        # p accumulates every sample that belongs to at least one mask.
        p = np.zeros(snvs.shape, dtype=bool)
        for m in masks.get(site[1], []):
            pp = np.ones(snvs.shape, dtype=bool)
            pp[[name_map[mm] for mm in m]] = False
            p = (p | (~pp))
            # Row holding only the masked samples; everything else becomes '-'.
            snv_x.append(np.copy(snvs))
            snv_x[-1][pp] = '-'
        # Final row: the original values with all masked samples blanked.
        snv_x.append(snvs)
        snv_x[-1][p] = '-'

        for snv in snv_x:
            snv_type, snv_cnt = np.unique(snv, return_counts=True)
            snv_type, snv_cnt = snv_type[
                ~np.in1d(snv_type, ['-', 'N', 'n']
                         )], snv_cnt[~np.in1d(snv_type, ['-', 'N', 'n'])]
            if snv_type.size > 1:
                # NOTE(review): bases is keyed by integer code points while
                # snv_type appears to hold strings here, so this membership
                # test looks like it can never succeed - confirm.
                for k, v in zip(snv_type, snv_cnt):
                    if k in bases:
                        bases[k] -= v * weight
                sss.append(['\t'.join(snv.tolist()), weight, site[:2]])

    with uopen(fname, 'w') as fout:
        # Per-sample constant-base counts, rounded; non-positive totals are
        # written as the float 0. (i.e. the text '0.0').
        fout.write('## Constant_bases: ' + ' '.join([
            str(int(inv[1] / names.size + 0.5) if inv[1] > 0 else 0.)
            for inv in sorted(bases.items())
        ]) + '\n')
        fout.write('#seq\t#site\t' + '\t'.join(names) + '\t#!W[RecFilter]\n')
        for snv, weight, site in sss:
            fout.write('{2}\t{3}\t{0}\t{1:.5f}\n'.format(snv, weight, *site))
    return fname
Exemple #20
0
def readFasta(fasta, filter=None) :
    """Read a FASTA file into a dict of upper-cased sequences.

    Parameters
    ----------
    fasta : path of the FASTA file; it is opened with ``uopen``.
    filter : optional container of record names. When given and non-empty,
        only records whose name is in it are kept; otherwise all records are.

    Returns
    -------
    dict mapping each record name (first whitespace-delimited token after
    ``>``) to its sequence, concatenated over all its lines and upper-cased.
    """
    sequence = {}
    # No record is open until the first '>' header is seen. Without this
    # initialisation, a sequence line before any header raised NameError.
    name = None
    with uopen(fasta) as fin :
        for line in fin :
            if line.startswith('>') :
                name = line[1:].strip().split()[0]
                if not filter or name in filter :
                    # A repeated header restarts the record from scratch.
                    sequence[name] = []
            elif name in sequence and not line.startswith('#') :
                # Lines of filtered-out records (name not in sequence) and
                # lines starting with '#' are ignored.
                sequence[name].extend(line.strip().split())
    for s in sequence :
        sequence[s] = (''.join(sequence[s])).upper()
    return sequence
Exemple #21
0
def read_states(fname) :
    """Load a tab-delimited site/state table, de-duplicating state patterns.

    The first line is the header; sample names are its fields from the third
    column onwards. Every following line is ``seq <TAB> site <TAB> states...``.

    Returns ``(names, states, sites)`` where ``states`` is a numpy array with
    one row per distinct state pattern (in order of first appearance) and
    ``sites`` is a list of ``[seq, int(site), pattern_index]``.
    """
    pattern_ids = {}
    sites = []
    with uopen(fname) as fin :
        names = fin.readline().strip().split('\t')[2:]
        for record in fin :
            seq, site, snp_str = record.strip().split('\t', 2)
            # setdefault assigns the next free index on first sight of a pattern
            pattern_id = pattern_ids.setdefault(snp_str, len(pattern_ids))
            sites.append([seq, int(site), pattern_id])
    ordered = sorted(pattern_ids.items(), key=lambda kv: kv[1])
    states = [pattern.split('\t') for pattern, _ in ordered]
    return names, np.array(states), sites
Exemple #22
0
def evaluate(profile, cluster, stepwise, ave_gene_length=1000.) :
    """Score a hierCC clustering against its allelic profiles.

    Both inputs are tab-delimited files with a one-line header. The profile
    columns kept are the one(s) whose header contains ``#ST`` (column 0 when
    there is none) plus every column whose header does not start with ``#``.
    For the cluster file, column 0 plus every non-``#`` column is kept, and
    only every ``stepwise``-th cluster level is retained.

    Rows are restricted to names found in both files and the profiles are
    reordered to follow the cluster rows. The Shannon index, adjusted Rand
    similarity and silhouette results are written to ``evalHCC.npz``.
    """
    with uopen(profile) as fin :
        logger('Loading profiles ...')
        header = fin.readline().strip().split('\t')
        st_cols = [idx for idx, field in enumerate(header) if '#ST' in field]
        if not st_cols :
            st_cols = [0]
        allele_cols = [idx for idx, field in enumerate(header) if not field.startswith('#')]
        profile_tab = pd.read_csv(fin, sep='\t', header=None, index_col=0,
                                  usecols=st_cols + allele_cols)
        profile_names = profile_tab.index.values
        profile_mat = profile_tab.values

    with uopen(cluster) as fin :
        logger('Loading hierCC ...')
        header = fin.readline().strip().split('\t')
        level_cols = [0] + [idx for idx, field in enumerate(header) if not field.startswith('#')]
        cluster_tab = pd.read_csv(fin, sep='\t', header=None, index_col=0, usecols=level_cols)
        cluster_names = cluster_tab.index.values
        cluster_mat = cluster_tab.values
        # keep every `stepwise`-th level only
        cluster_mat = cluster_mat[:, np.arange(0, cluster_mat.shape[1], stepwise)]

    # Drop cluster rows that have no profile, then align profiles to them.
    shared = np.in1d(cluster_names, profile_names)
    cluster_mat = cluster_mat[shared]
    cluster_names = cluster_names[shared]
    rank = {n: i for i, n in enumerate(cluster_names)}
    pairs = np.array([[i, rank[n]] for i, n in enumerate(profile_names) if n in rank])
    profile_order = pairs[np.argsort(pairs.T[1]), 0]
    profile_names = profile_names[profile_order]
    profile_mat = profile_mat[profile_order]

    shannon = shannon_index(cluster_mat)
    similarity = get_similarity('adjusted_rand_score', cluster_mat, stepwise)
    silhouette = get_silhouette(profile_mat, cluster_mat, stepwise, ave_gene_length)

    np.savez_compressed('evalHCC.npz', shannon=shannon, similarity=similarity, silhouette=silhouette)
    logger('Done. Results saved in evalHCC.npz')
Exemple #23
0
def readRecRegions(recFile, seqRange):
    """Read recombination regions from a whitespace-delimited file.

    Two line layouts are recognised:

    * ``Importation <label> <seq> <start> <end> ...`` -- the coordinates are
      shifted by ``seqRange[seq][0]``; lines whose ``seq`` is not in
      ``seqRange`` are skipped.
    * exactly three fields ``<label> <start> <end>`` -- used as-is; lines
      whose start/end are not integers (e.g. a header row) are skipped.

    Blank lines and any other layout are ignored.

    Returns a list of ``[label, start, end]`` entries.
    """
    recBlocks = []
    with uopen(recFile) as fin:
        for line in fin:
            part = line.strip().split()
            if not part:
                # blank line: nothing to parse (part[0] would raise IndexError)
                continue
            if part[0] == 'Importation':
                if part[2] not in seqRange:
                    continue
                acc = seqRange[part[2]][0]
                recBlocks.append(
                    [part[1], acc + int(part[3]), acc + int(part[4])])
            elif len(part) == 3:
                # Only a failed integer conversion is an expected, ignorable
                # condition here; anything else should surface.
                try:
                    start, end = int(part[1]), int(part[2])
                except ValueError:
                    continue
                recBlocks.append([part[0], start, end])
    return recBlocks
Exemple #24
0
def write_matrix(fname, names, sites, snps, seqNames, missing) :
    """Write the SNP matrix file with its ``##`` metadata header.

    ``snps`` maps a state pattern (an iterable of strings) to a record whose
    element 0 is the pattern id, element 1 is compared against 0 to detect
    invariant patterns, and element 2 is the count added to the
    constant-base totals. ``sites`` rows are ``[seq, site, pattern_id, ...]``.

    Returns ``fname``.
    """
    # Invariant patterns (second field 0, first state not a gap), keyed by id.
    invariants = {}
    for pattern, rec in snps.items() :
        if rec[1] == 0 and pattern[0] != '-' :
            invariants[rec[0]] = [pattern, rec[2]]

    bases = {}
    for pattern, count in invariants.values() :
        bases[pattern] = bases.get(pattern, 0) + count

    # Pre-render every pattern as the tab-joined text written per site.
    sv = {}
    for pattern, rec in snps.items() :
        sv[rec[0]] = '\t'.join(pattern)

    constant = [str(total) for _, total in sorted(bases.items())]
    with uopen(fname, 'w') as fout :
        fout.write('## Constant_bases: ' + ' '.join(constant) + '\n')
        for seq_name, seq_len in seqNames :
            fout.write('## Sequence_length: {0} {1}\n'.format(seq_name, seq_len))
        for seq_name, start, end in missing :
            fout.write('## Missing_region: {0} {1} {2}\n'.format(seq_name, start, end))
        fout.write('#seq\t#site\t' + '\t'.join(names) + '\n')
        for site in sites :
            fout.write('{1}\t{2}\t{0}\n'.format(sv[site[2]], *site[:2]))
    return fname
Exemple #25
0
def write_filtered_matrix(fname, names, sites, snps, masks, m_weight):
    """Write a SNP matrix with masked calls and a per-site weight column.

    Parameters
    ----------
    fname : output path, opened with ``uopen``.
    names : sample names; ``names.size`` is used, so a numpy array is expected.
    sites : rows ``[seq, site, pattern_id, ...]``.
    snps : maps a state pattern to a record whose element 0 is the pattern id
        and element 2 is the multiplier used for the base totals.
    masks : maps ``site[1]`` to a list of sample-name groups.
    m_weight : maps ``site[1]`` to a dict whose values are averaged into the
        weight of the site. Sites without an entry, or with an empty one, are
        skipped.

    Returns
    -------
    ``fname``.
    """
    # Per-nucleotide totals over all patterns; variable sites are subtracted
    # again below, leaving the constant-base estimate.
    bases = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    for base, snp in snps.items():
        for n, c in zip(*np.unique(base, return_counts=True)):
            if n in bases:
                bases[n] += c * snp[2]

    sv = {ss[0]: s for s, ss in snps.items()}
    name_map = {name: idx for idx, name in enumerate(names)}
    ignored = ['-', 'N', 'n']
    sss = []
    for site in sites:
        # A site absent from m_weight used to raise KeyError; treat it like a
        # site with no weights, as the sibling implementation does.
        try:
            site_weights = m_weight[site[1]]
        except KeyError:
            continue
        if not len(site_weights):
            continue
        weight = np.mean(list(site_weights.values()))
        snvs = np.array(sv[site[2]])
        snv_x = []
        p = np.zeros(snvs.shape, dtype=bool)
        for m in masks.get(site[1], []):
            # pp is True for every sample NOT in this group
            pp = np.ones(snvs.shape, dtype=bool)
            pp[[name_map[mm] for mm in m]] = False
            p = (p | (~pp))
            # one extra row per group: only the group's samples keep a call
            snv_x.append(np.copy(snvs))
            snv_x[-1][pp] = '-'
        # final row: all grouped samples blanked out
        snv_x.append(snvs)
        snv_x[-1][p] = '-'

        for snv in snv_x:
            snv_type, snv_cnt = np.unique(snv, return_counts=True)
            keep = ~np.in1d(snv_type, ignored)
            snv_type, snv_cnt = snv_type[keep], snv_cnt[keep]
            # only rows that are still polymorphic are written
            if snv_type.size > 1:
                for k, v in zip(snv_type, snv_cnt):
                    if k in bases:
                        bases[k] -= v * weight
                sss.append(['\t'.join(snv.tolist()), weight, site[:2]])

    with uopen(fname, 'w') as fout:
        # Totals are divided by the number of samples and rounded; a
        # non-positive total is written as the literal '0.0'.
        fout.write('## Constant_bases: ' + ' '.join([
            str(int(inv[1] / names.size + 0.5) if inv[1] > 0 else 0.)
            for inv in sorted(bases.items())
        ]) + '\n')
        fout.write('#seq\t#site\t' + '\t'.join(names) + '\t#!W[RecFilter]\n')
        for snv, weight, site in sss:
            fout.write('{2}\t{3}\t{0}\t{1:.5f}\n'.format(snv, weight, *site))
    return fname
Exemple #26
0
def read_states(fname):
    """Load a site/state matrix, skipping its ``##`` metadata lines.

    The first line that does not start with ``##`` is the header; sample names
    are its fields from the third column onwards. Each remaining line is
    ``seq <TAB> site <TAB> states...``.

    Returns ``(names, states, sites)``: ``states`` holds one row per distinct
    state pattern (in order of first appearance), each viewed through the
    module-level ``asc2int`` dtype, and ``sites`` is a list of
    ``[seq, int(site), pattern_index]``.
    """
    names = []
    pattern_ids = {}
    sites = []
    with uopen(fname) as fin:
        # consume the metadata block up to and including the header line
        for line in fin:
            if not line.startswith('##'):
                names = line.strip().split('\t')[2:]
                break
        # the same iterator continues with the data rows
        for record in fin:
            seq, site, snp_str = record.strip().split('\t', 2)
            pattern_id = pattern_ids.setdefault(snp_str, len(pattern_ids))
            sites.append([seq, int(site), pattern_id])
    ordered = sorted(pattern_ids.items(), key=lambda kv: kv[1])
    states = [np.array(pattern.split('\t')).view(asc2int) for pattern, _ in ordered]
    return names, np.array(states), sites
Exemple #27
0
def readAncestral(fname, nSample) :
    """Read an ancestral-state matrix and encode its bases as small integers.

    ``##`` lines are skipped; the first line starting with a single ``#`` is
    the header, and columns whose header does not start with ``#`` are the
    node columns. Only rows whose first node column is A, C, G or T are kept.

    Parameters
    ----------
    fname : path of the matrix, opened with ``uopen``.
    nSample : number of ``1.`` entries appended after the site key in each
        ``sites`` row.

    Returns
    -------
    ``(sites, nodeNames, data)`` where ``sites`` is an object array with rows
    ``[(contig, site), 1., ...]``, ``nodeNames`` are the node column headers
    and ``data`` is a contiguous int8 array indexed ``[node, site]``.

    Raises
    ------
    ValueError : if no header line is present.
    """
    sites = []
    mat = None
    with uopen(fname) as fin :
        for line in fin :
            if line.startswith('##') :
                continue
            if line.startswith('#') :
                header = line.strip().split('\t')
                nodes = np.array([not h.startswith('#') for h in header], dtype=bool)
                nodeNames = np.array(header)[nodes]
                # everything after the header is the data table
                mat = pd.read_csv(fin, sep='\t', header=None, dtype=str).values
                break
    if mat is None :
        raise ValueError('No "#" header line found in {0}'.format(fname))
    mat = mat[np.in1d(mat.T[nodes][0], ['A', 'C', 'G', 'T'])]

    # Lookup table indexed by ASCII code: C(67)->1, G(71)->2, T(84)->3,
    # every other code up to 84 (including A) -> 0.
    conv = np.bincount([ 67,  71, 71,  84, 84, 84])
    data = conv[mat[:, nodes].astype(bytes).view(np.int8)]
    for cont, site in mat[:, :2] :
        sites.append([(cont, int(site)), ] + [1.]*nSample)
    # Rows mix a tuple with floats, i.e. they are ragged for numpy; recent
    # numpy versions refuse to build such an array unless dtype=object.
    return np.array(sites, dtype=object), nodeNames, np.ascontiguousarray(data.T, dtype=np.int8)
Exemple #28
0
def readMutations(mutationFile):
    """Read a mutation table and place its sites on concatenated coordinates.

    Header handling:

    * ``## Sequence_length: <name> <len>`` -- sequences are laid end to end;
      ``seqRange[name]`` becomes ``[offset, offset + len]``.
    * ``## Missing_region: <name> <start> <end>`` -- a region covering the
      whole sequence removes that sequence and shifts every range/block
      located at or after its end down by its length; any other region is
      kept in ``missingBlocks`` when it spans at least 500 positions.
    * the first line starting with a single ``#`` is the column header; the
      rest of the file is the table.

    Only rows whose fifth column is exactly 4 characters long and whose
    second column names a sequence still in ``seqRange`` are used.

    Returns
    -------
    ``(seqRange, missingBlocks, mutBlocks)`` where ``mutBlocks`` is a list of
    ``(first column value, offset + site)`` pairs.

    Raises
    ------
    ValueError : if the file has no column-header line.
    """
    accLength = 0
    seqRange = {}
    missingBlocks = []
    mat = None
    with uopen(mutationFile) as fin:
        for line in fin:
            if line.startswith('##'):
                if line.startswith('## Sequence_length:'):
                    part = line.strip().split()
                    seqRange[part[2]] = [accLength, accLength + int(part[3])]
                    accLength += int(part[3])
                elif line.startswith('## Missing_region:'):
                    part = line.strip().split()
                    acc = seqRange[part[2]][0]
                    s, e = int(part[3]) + acc, int(part[4]) + acc
                    if [s - 1, e] == seqRange[part[2]]:
                        # the whole sequence is missing: drop it and close
                        # the gap it leaves in the concatenated coordinates
                        diff = e - s + 1
                        seqRange.pop(part[2])
                        for n in seqRange:
                            if seqRange[n][0] >= e: seqRange[n][0] -= diff
                            if seqRange[n][1] >= e: seqRange[n][1] -= diff
                        for blk in missingBlocks:
                            if blk[0] >= e: blk[0] -= diff
                            if blk[1] >= e: blk[1] -= diff
                    else:
                        if e - s + 1 >= 500:
                            missingBlocks.append([s, e])
            elif line.startswith('#'):
                mat = pd.read_csv(fin, sep='\t', header=None, dtype=str).values
                break
    if mat is None:
        raise ValueError('No "#" header line found in {0}'.format(mutationFile))

    # Plain comprehensions instead of np.vectorize: vectorize cannot infer its
    # output type from a size-0 input and raises when no rows are left.
    desc_len = np.array([len(x) for x in mat.T[4]], dtype=int)
    mat = mat[desc_len == 4]
    # -1 marks rows on sequences that are unknown or were removed above
    offsets = np.array([seqRange.get(x, [-1])[0] for x in mat.T[1]], dtype=int)
    positions = mat.T[2].astype(int)
    keep = offsets >= 0
    mat = mat[keep]
    sites = offsets[keep] + positions[keep]
    mutBlocks = list(zip(mat.T[0], sites))
    return seqRange, missingBlocks, mutBlocks
Exemple #29
0
def readAncestral(fname,):
    """Read an ancestral-state matrix in chunks and encode its bases.

    ``##`` lines are skipped; the first line starting with a single ``#`` is
    the header, and columns whose header does not start with ``#`` are the
    node columns. The table is read 20000 rows at a time with progress
    written to stderr. Only rows whose first node column is A, C, G or T are
    kept.

    Returns
    -------
    ``(sites, nodeNames, data)`` where ``sites`` is an object array with rows
    ``[(contig, site), 1.]``, ``nodeNames`` are the node column headers and
    ``data`` is a contiguous uint8 array indexed ``[node, site]`` with
    A/C/G/T encoded as 0/1/2/3.

    Raises
    ------
    ValueError : if no header line is present.
    """
    sites = []
    data = []
    # Lookup table indexed by ASCII code: A->0, C->1, G->2, T->3, and -1 for
    # every other code up to 84 (which wraps to 255 in the uint8 output).
    conv = np.bincount([65, 67, 67, 71, 71, 71, 84, 84, 84, 84]) - 1
    sys.stderr.write('Start reading Matrix: \n')
    nodes = None
    with uopen(fname) as fin:
        for line in fin:
            if line.startswith('##'):
                pass
            elif line.startswith('#'):
                header = line.strip().split('\t')
                nodes = np.array([not h.startswith('#') for h in header], dtype=bool)
                nodeNames = np.array(header)[nodes]
                break
        if nodes is None:
            raise ValueError('No "#" header line found in {0}'.format(fname))
        first_node = np.where(nodes)[0][0]
        for i, chunk in enumerate(pd.read_csv(fin, sep='\t', header=None, dtype=str, chunksize=20000)):
            sys.stderr.write('Reading Matrix - Line: {0}      \r'.format(i * 20000))
            mat = chunk.values
            mat = mat[np.in1d(mat[:, first_node], ['A', 'C', 'G', 'T'])]
            if mat.shape[0] == 0:
                # np.vectorize cannot handle a size-0 input without otypes
                continue
            data.append(conv[np.vectorize(ord)(mat[:, nodes])])
            for cont, site in mat[:, :2]:
                sites.append([(cont, int(site)), 1. ])
    sys.stderr.write('Read Matrix DONE. Total SNP sites: {0}                \n'.format(len(sites)))
    if data:
        states = np.vstack(data).T
    else:
        # nothing retained: keep the [node, site] layout with zero sites
        states = np.zeros((nodeNames.size, 0), dtype=int)
    # Rows mix a tuple with a float, i.e. they are ragged for numpy; recent
    # numpy versions refuse to build such an array unless dtype=object.
    return np.array(sites, dtype=object), nodeNames, np.ascontiguousarray(states, dtype=np.uint8)
Exemple #30
0
def getMatrix(prefix, reference, alignments, core, matrixOut, alignmentOut) :
    """Merge per-sample alignment maps into a core SNP matrix and/or alignment.

    Parameters
    ----------
    prefix : prefix of the output files.
    reference : reference file, read with ``readFastq``.
    alignments : sequence of ``(tag, map_file)`` pairs; each pair is parsed by
        ``readMap`` (through the module-level ``pool``) into
        ``(presences, absences, mutations)``.
    core : fraction of alignments; a site is core when its presence count is
        at least ``len(alignments) * core``.
    matrixOut : when true, write ``<prefix>.matrix.gz``.
    alignmentOut : when true, write ``<prefix>.fasta.gz``.

    Returns
    -------
    dict with the written paths under the keys ``'matrix'`` and/or
    ``'alignment'``.
    """
    refSeq, refQual = readFastq(reference)
    # coreSites: per reference position, presences minus absences.
    # matSites: per reference position, the 1-based site number when a
    # mutation was reported there, else 0.
    coreSites = { n:np.zeros(len(refSeq[n]), dtype=int) for n in refSeq }
    matSites = { n:np.zeros(len(refSeq[n]), dtype=int) for n in refSeq }
    alnId = { aln[0]:id for id, aln in enumerate(alignments) }
    res = pool.map(readMap, alignments)

    # matrix[(seq, site)] = [row of 1-character calls, row of longer calls];
    # each row is created lazily with '-' for every alignment.
    matrix = {}
    for presences, absences, mutations in res :
        for mut in mutations :
            j = alnId[mut[0]]
            site = tuple(mut[1:3])
            if site not in matrix :
                matrix[site] = [[], []]
                matSites[mut[1]][mut[2]-1] = mut[2]
            if len(mut[4]) == 1 :
                if len(matrix[site][0]) == 0 :
                    matrix[site][0] = ['-' for id in alnId]
                matrix[site][0][j] = mut[4]
            else :
                if len(matrix[site][1]) == 0 :
                    matrix[site][1] = ['-' for id in alnId]
                matrix[site][1][j] = mut[4]
    for (mTag, mFile), (presences, absences, mutations) in zip(alignments, res) :
        j = alnId[mTag]
        # Covered positions without a call become '.', absent ones go back
        # to '-'. Explicit calls are never overwritten.
        for n, s, e in presences :
            coreSites[n][s-1:e] +=1
            mutations = matSites[n][s-1:e]
            for kk in mutations[mutations > 0] :
                k = (n, kk)
                if len(matrix[k][0]) and matrix[k][0][j] == '-' :
                    matrix[k][0][j] = '.'
                if len(matrix[k][1]) and matrix[k][1][j] == '-' :
                    matrix[k][1][j] = '.'
        for n, s, e, m in absences :
            coreSites[n][s-1:e] -=1
            mutations = matSites[n][s-1:e]
            for kk in mutations[mutations > 0] :
                k = (n, kk)
                if len(matrix[k][0]) and matrix[k][0][j] == '.' :
                    matrix[k][0][j] = '-'
                if len(matrix[k][1]) and matrix[k][1][j] == '.' :
                    matrix[k][1][j] = '-'
    # Report the distribution of positive presence counts on stderr.
    pres = np.unique(np.concatenate(list(coreSites.values())), return_counts=True)
    pres = [pres[0][pres[0] > 0], pres[1][pres[0] > 0]]
    coreNum = len(alignments) * core
    for p, n in zip(*pres) :
        sys.stderr.write('#{2} {0} {1}\n'.format(p, n, '' if p > coreNum else '#'))

    missings = []
    coreBases = {'A':0, 'C':0, 'G':0, 'T':0}
    for n in sorted(coreSites) :
        sites = coreSites[n]
        for site, num in enumerate(sites) :
            cSite = (n, site+1)
            if num < coreNum and cSite in matrix and len(matrix[cSite][1]) > 0 :
                # Recount from the long-call row. The row is a plain list, so
                # it must be counted element by element: comparing the list
                # itself with '-' gives a single boolean, not a count.
                num = sum(1 for b in matrix[cSite][1] if b != '-')
                matrix[cSite][0] = []
            if num < coreNum :
                # non-core position: drop it and extend/open a missing region
                matrix.pop(cSite, None)
                if len(missings) == 0 or missings[-1][0] != n or missings[-1][2] + 1 < cSite[1] :
                    missings.append([n, cSite[1], cSite[1]])
                else :
                    missings[-1][2] = cSite[1]
            else :
                b = refSeq[n][cSite[1]-1]
                if cSite in matrix and len(matrix[cSite][0]) :
                    # '.' means "same as reference": substitute the base
                    matrix[cSite][0] = [ (b if s == '.' else s) for s in matrix[cSite][0]]
                else :
                    coreBases[b] = coreBases.get(b, 0) + 1

    outputs = {}
    if matrixOut :
        outputs['matrix'] = prefix + '.matrix.gz'
        with uopen(prefix + '.matrix.gz', 'w') as fout :
            fout.write('## Constant_bases: {A} {C} {G} {T}\n'.format(**coreBases))
            for n in refSeq :
                fout.write('## Sequence_length: {0} {1}\n'.format(n, len(refSeq[n])))
            for region in missings :
                fout.write('## Missing_region: {0} {1} {2}\n'.format(*region))
            fout.write('\t'.join(['#Seq', '#Site'] + [ mTag for mTag, mFile in alignments ]) + '\n')
            for site in sorted(matrix) :
                bases = matrix[site]
                if len(bases[0]) :
                    fout.write('{0}\t{1}\t{2}\n'.format(site[0], site[1], '\t'.join(bases[0])))
                if len(bases[1]) :
                    fout.write('{0}\t{1}\t{2}\n'.format(site[0], site[1], '\t'.join(bases[1])))
    if alignmentOut :
        outputs['alignment'] = prefix + '.fasta.gz'
        sequences = []
        for (mTag, mFile), (presences, absences, mutations) in zip(alignments, res) :
            j = alnId[mTag]
            # The first alignment starts as a full copy of the reference; all
            # others start as gaps and are filled from their presence regions.
            seq = { n:['-']*len(s) for n, s in refSeq.items() } if j > 0 else { n:list(s) for n, s in refSeq.items() }
            if j :
                for n, s, e in presences :
                    seq[n][s-1:e] = refSeq[n][s-1:e]
                for n, s, e, c in absences :
                    seq[n][s-1:e] = '-' * (e-s+1)
            for site in matrix :
                bases = matrix[site]
                if len(bases[0]) :
                    seq[site[0]][site[1]-1] = bases[0][j]
            sequences.append(seq)
        with uopen(prefix + '.fasta.gz', 'w') as fout :
            # reference sequences are separated by a line holding '='
            for id, n in enumerate(sorted(refSeq)) :
                if id :
                    fout.write('=\n')
                for (mTag, mFile), seq in zip(alignments, sequences) :
                    fout.write('>{0}:{1}\n{2}\n'.format(mTag, n, ''.join(seq[n])))
    return outputs