def main():
    """Insert the ML tree for one BPG family from the command line.

    Expects exactly two positional arguments:

    * ``alignment_path`` -- a path that must exist on the filesystem;
    * ``family_accession`` -- ``bpg`` followed by exactly seven digits.

    Invalid arguments are reported through ``OptionParser.error``, which
    prints the usage message and exits.  Otherwise ``insertMLTree`` is
    called and its truthy/falsy result is reported on stdout.
    """
    parser = OptionParser(
        usage='%prog [options] alignment_path family_accession')
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.error('Must supply alignment_path and family_accession')

    alignment_path, family_accession = args
    if not os.path.exists(alignment_path):
        parser.error('Alignment_path %s not found' % alignment_path)

    # Raw string so that \d is a regex escape rather than a (deprecated)
    # string escape.  \Z anchors at the very end of the string; unlike '$'
    # it does not tolerate a trailing newline, which matches the original
    # "whole argument must match" length check.
    if not re.match(r'bpg\d{7}\Z', family_accession):
        parser.error('%s is not a valid BPG family accession'
                     % family_accession)

    if insertMLTree(alignment_path, family_accession):
        print("Success: ML tree was inserted.")
    else:
        print("Error: ML tree was NOT inserted!")
Example #2
0
def _announce(message):
    """Write a progress message without a newline and flush it immediately.

    The message is followed by a single space so that the 'done. ...' line
    printed when the step finishes continues on the same output line.
    """
    sys.stdout.write(message + ' ')
    sys.stdout.flush()


def _report_done(started_at):
    """Print the 'done.' line with the time elapsed since started_at."""
    print('done. %s' % getTimeStr(started_at, time.time()))


def _run_step(args, stdout_path=None, stderr_path=None, extra_env=None):
    """Run an external command to completion and return its exit status.

    args is the argument list handed to subprocess.Popen (no shell is
    involved).  When stdout_path / stderr_path are given, that stream of the
    child is written to the named file (truncating it); otherwise the stream
    is inherited.  extra_env, if given, is a dict of environment variables
    set for the child on top of the current environment.

    A non-zero exit status is reported on stderr but does not abort the
    caller.  An OSError raised by Popen (e.g. command not found) propagates,
    with the log files closed first.
    """
    env = None
    if extra_env:
        env = dict(os.environ)
        env.update(extra_env)

    out_file = None
    err_file = None
    try:
        if stdout_path is not None:
            out_file = open(stdout_path, 'w')
        if stderr_path is not None:
            err_file = open(stderr_path, 'w')
        proc = subprocess.Popen(args, stdout=out_file, stderr=err_file,
                                env=env)
        returncode = proc.wait()
    finally:
        for handle in (out_file, err_file):
            if handle is not None:
                handle.close()

    if returncode != 0:
        sys.stderr.write('Warning: %s exited with status %s\n'
                         % (args[0], returncode))
    return returncode


def main():
    """Polish one BPG family directory, creating whatever is missing.

    Takes a single positional argument, a bpg accession such as bpg0164198,
    changes into the family directory derived from it and then:

    * runs compute_alignment_conservation.py if
      <accession>.alignmentconservation.csv does not exist;
    * rebuilds <accession>.hmm with hmm3build if the first line of the
      existing HMM is not 'HMMER3/b [3.0 | March 2010]';
    * if no file matching *fasttree* exists, reformats the alignment, runs
      FastTree, reroots the tree with midpoint_reroot.py, calls
      insertMLTree, and runs find_orthologs and
      find_thresholded_phogs_django.py;
    * touches 'polish_family_done'.

    Invalid arguments are reported through OptionParser.error, which exits.
    External commands that exit with a non-zero status are reported on
    stderr; processing continues with the next step.
    """
    # parse command line options
    usage = "%prog [options] bpg_accession"
    opt_parser = OptionParser(usage=usage)
    (options, args) = opt_parser.parse_args()
    if len(args) != 1:
        opt_parser.error('Incorrect number of arguments')
    bpg_accession = args[0]
    if len(bpg_accession) != 10 or not bpg_accession.startswith('bpg'):
        opt_parser.error('Argument must be a bpg accession like bpg0164198')
    try:
        # The numeric value itself is not needed here; the conversion only
        # validates the part after the 'bpg' prefix.
        int(bpg_accession[3:])
    except ValueError:
        opt_parser.error('Argument must be a bpg accession like bpg0164198')
    family_dir = '/clusterfs/ohana/bpg/pfacts/%s/%s/%s' % (
        bpg_accession[0:4], bpg_accession[0:7], bpg_accession)
    if not os.path.exists(family_dir):
        opt_parser.error('Family %s not found on the filesystem.' %
                         bpg_accession)

    print("Polishing family %s in %s" % (bpg_accession, family_dir))
    os.chdir(family_dir)

    overall_starttime = time.time()

    if not os.path.exists(bpg_accession + '.alignmentconservation.csv'):
        starttime = time.time()
        _announce('computing alignment conservation and inserting...')
        _run_step(['compute_alignment_conservation.py', bpg_accession],
                  stderr_path='compute_alignment_conservation.err')
        _report_done(starttime)

    with open(bpg_accession + '.hmm') as hmm_file:
        version_line = hmm_file.readline().strip()

    if version_line != 'HMMER3/b [3.0 | March 2010]':
        starttime = time.time()
        print("Old HMMER3 version line:  %s" % version_line)

        stockholm_files = glob.glob('*.stockholm')
        if not stockholm_files:
            # Fail with a readable message instead of an IndexError.
            sys.exit('Error: no *.stockholm file found in %s; '
                     'cannot rebuild the HMM.' % family_dir)

        _announce('Rebuilding the HMM using the production version of '
                  'HMMER3...')
        # An argument list (rather than shlex.split on a formatted string)
        # keeps file names containing whitespace or quotes intact.
        _run_step(['hmm3build', '-n', bpg_accession, '--hand',
                   '%s.hmm' % bpg_accession, stockholm_files[0]],
                  stdout_path='hmmer3build.out',
                  stderr_path='hmmer3build.err')
        _report_done(starttime)

    if not glob.glob('*fasttree*'):
        print('No FastTree ML tree found.  Will create.')

        starttime = time.time()
        _announce('Reformatting alignment...')
        _run_step(['align_nj_to_align_ml.py'],
                  stderr_path='align_nj_to_align_ml.err')
        _report_done(starttime)

        starttime = time.time()
        _announce('Running FastTree...')
        unrooted_tree_name = 'final_trimmed_mafft_ungapped.fasttree.ml.tre'
        _run_step(
            ['FastTree', '-gamma', 'final_trimmed_mafft_ungapped.aln.ml'],
            stdout_path=unrooted_tree_name,
            stderr_path='final_trimmed_mafft_ungapped.fasttree.log')
        _report_done(starttime)

        starttime = time.time()
        _announce('Rerooting tree at midpoint of the longest span...')
        # PYTHONPATH is overridden for this one command only.
        _run_step(
            ['midpoint_reroot.py', unrooted_tree_name],
            stdout_path='final_trimmed_mafft_ungapped.fasttree.ml.rooted.tre',
            stderr_path='midpoint_reroot.err',
            extra_env={
                'PYTHONPATH': '/clusterfs/ohana/software/lib/python2.6'})
        _report_done(starttime)

        starttime = time.time()
        _announce('Inserting ML tree...')
        if not insertMLTree('final_trimmed_mafft_ungapped.afa',
                            bpg_accession):
            sys.stderr.write("Error: ML tree was NOT inserted!\n")
        _report_done(starttime)

        starttime = time.time()
        _announce('Running find_orthologs...')
        _run_step(
            ['find_orthologs', '--book', bpg_accession, '--method', 'ml'],
            stdout_path='%s.find_orthologs.ml.out' % bpg_accession,
            stderr_path='%s.find_orthologs.ml.err' % bpg_accession)
        _report_done(starttime)

        starttime = time.time()
        _announce('Running find_thresholded_phogs_django.py...')
        _run_step(
            ['find_thresholded_phogs_django.py', '-m', 'ml', bpg_accession],
            stdout_path='%s.find_thresholded_phogs.ml.out' % bpg_accession,
            stderr_path='%s.find_thresholded_phogs.ml.err' % bpg_accession)
        _report_done(starttime)

    overall_endtime = time.time()
    print("Done polishing family %s. Overall: %s"
          % (bpg_accession, getTimeStr(overall_starttime, overall_endtime)))

    # The marker is written even if a step above reported a failure, as in
    # the original workflow; check stderr for warnings.
    _run_step(['touch', 'polish_family_done'])
def _announce(message):
    """Write a progress message without a newline and flush it immediately.

    The message is followed by a single space so that the 'done. ...' line
    printed when the step finishes continues on the same output line.
    """
    sys.stdout.write(message + ' ')
    sys.stdout.flush()


def _report_done(started_at):
    """Print the 'done.' line with the time elapsed since started_at."""
    print('done. %s' % getTimeStr(started_at, time.time()))


def _run_step(args, stdout_path=None, stderr_path=None, extra_env=None,
              capture_stderr=False):
    """Run an external command to completion and return its exit status.

    args is the argument list handed to subprocess.Popen (no shell is
    involved).  When stdout_path / stderr_path are given, that stream of the
    child is written to the named file (truncating it); otherwise the stream
    is inherited.  extra_env, if given, is a dict of environment variables
    set for the child on top of the current environment.

    With capture_stderr=True (and no stderr_path) the child's stderr is
    kept off the console and only echoed to our stderr if the command
    fails.

    A non-zero exit status is reported on stderr but does not abort the
    caller.  An OSError raised by Popen (e.g. command not found) propagates,
    with the log files closed first.
    """
    env = None
    if extra_env:
        env = dict(os.environ)
        env.update(extra_env)

    out_file = None
    err_file = None
    try:
        if stdout_path is not None:
            out_file = open(stdout_path, 'w')
        if stderr_path is not None:
            err_file = open(stderr_path, 'w')
        if capture_stderr and err_file is None:
            stderr_target = subprocess.PIPE
        else:
            stderr_target = err_file
        proc = subprocess.Popen(args, stdout=out_file, stderr=stderr_target,
                                env=env, universal_newlines=True)
        # communicate() drains the pipe while waiting, so a child that
        # writes a lot to stderr cannot block forever on a full pipe.
        captured_err = proc.communicate()[1]
    finally:
        for handle in (out_file, err_file):
            if handle is not None:
                handle.close()

    if proc.returncode != 0:
        sys.stderr.write('Warning: %s exited with status %s\n'
                         % (args[0], proc.returncode))
        if captured_err:
            sys.stderr.write(captured_err)
    return proc.returncode


def main():
    """Polish one BPG family directory, creating whatever is missing.

    Takes a single positional argument, a bpg accession such as bpg0164198,
    changes into the family directory derived from it and then:

    * if <accession>.seqs.fa.pin does not exist, writes the unaligned
      sequences with alignedfasta_to_unalignedfasta.py and runs formatdb
      on them;
    * if no file matching *fasttree* exists, reformats the alignment, runs
      FastTree and reroots the tree with midpoint_reroot.py;
    * if the database has no Tree with method 'ml' for the family, calls
      insertMLTree;
    * if ml_tree_ordered.a2m does not exist, writes the alignment stored
      for the family's canonical root node in the order of the ML tree's
      leaves (by left_id) and points <accession>.a2m at it via a symlink;
    * touches 'polish_family_20101223_done'.

    Invalid arguments are reported through OptionParser.error, which exits.
    External commands that exit with a non-zero status are reported on
    stderr; processing continues with the next step.
    """
    # parse command line options
    usage = "%prog [options] bpg_accession"
    opt_parser = OptionParser(usage=usage)
    (options, args) = opt_parser.parse_args()
    if len(args) != 1:
        opt_parser.error('Incorrect number of arguments')
    bpg_accession = args[0]
    if len(bpg_accession) != 10 or not bpg_accession.startswith('bpg'):
        opt_parser.error('Argument must be a bpg accession like bpg0164198')
    try:
        family_id = int(bpg_accession[3:])
    except ValueError:
        opt_parser.error('Argument must be a bpg accession like bpg0164198')
    family_dir = '/clusterfs/ohana/bpg/pfacts/%s/%s/%s' % (
        bpg_accession[0:4], bpg_accession[0:7], bpg_accession)
    if not os.path.exists(family_dir):
        opt_parser.error('Family %s not found on the filesystem.' %
                         bpg_accession)

    print("Polishing family %s in %s" % (bpg_accession, family_dir))
    os.chdir(family_dir)

    overall_starttime = time.time()

    family_a2m = '%s.a2m' % bpg_accession
    unaligned_seqs_file = '%s.seqs.fa' % bpg_accession
    blastable_database_index = '%s.pin' % unaligned_seqs_file

    if not os.path.exists(blastable_database_index):
        print('No BLASTable database found.  Will create.')

        starttime = time.time()
        print('Creating file of unaligned sequences from %s.' % bpg_accession)
        sys.stdout.flush()
        _run_step(['alignedfasta_to_unalignedfasta.py', family_a2m,
                   unaligned_seqs_file],
                  capture_stderr=True)
        _report_done(starttime)

        starttime = time.time()
        print('Creating BLASTable database of sequences from %s.'
              % bpg_accession)
        sys.stdout.flush()
        _run_step(['formatdb', '-o', 'T', '-l', 'formatdb.log', '-i',
                   unaligned_seqs_file],
                  capture_stderr=True)
        _report_done(starttime)

    if not glob.glob('*fasttree*'):
        print('No FastTree ML tree found.  Will create.')

        starttime = time.time()
        _announce('Reformatting alignment...')
        _run_step(['align_nj_to_align_ml.py'],
                  stderr_path='align_nj_to_align_ml.err')
        _report_done(starttime)

        starttime = time.time()
        _announce('Running FastTree...')
        unrooted_tree_name = 'final_trimmed_mafft_ungapped.fasttree.ml.tre'
        _run_step(
            ['FastTree', '-gamma', 'final_trimmed_mafft_ungapped.aln.ml'],
            stdout_path=unrooted_tree_name,
            stderr_path='final_trimmed_mafft_ungapped.fasttree.log')
        _report_done(starttime)

        starttime = time.time()
        _announce('Rerooting tree at midpoint of the longest span...')
        # PYTHONPATH is overridden for this one command only.
        _run_step(
            ['midpoint_reroot.py', unrooted_tree_name],
            stdout_path='final_trimmed_mafft_ungapped.fasttree.ml.rooted.tre',
            stderr_path='midpoint_reroot.err',
            extra_env={
                'PYTHONPATH': '/clusterfs/ohana/software/lib/python2.6'})
        _report_done(starttime)

    tree_objects = Tree.objects.filter(family__id=family_id, method='ml')
    if not tree_objects:
        starttime = time.time()
        _announce('Inserting ML tree...')
        if not insertMLTree('final_trimmed_mafft_ungapped.afa',
                            bpg_accession):
            sys.stderr.write("Error: ML tree was NOT inserted!\n")
        _report_done(starttime)

    if not os.path.exists('ml_tree_ordered.a2m'):
        print('Tree-ordered alignment file not found, creating.')
        family = Family.objects.get(id=family_id)
        tree_node_alignment_objects = TreeNodeAlignment.objects.filter(
            tree_node=family.canonical_root_node())

        starttime = time.time()
        _announce('Reading alignment from database...')
        aligned_seq_of_seq_header = dict(
            (row.sequence_header, row.aligned_sequence)
            for row in tree_node_alignment_objects)
        _report_done(starttime)

        tree = Tree.objects.get(family__id=family_id, method='ml')
        starttime = time.time()
        _announce('Reading order from database...')
        leaves = TreeNode.objects.filter(
            tree=tree, sequence_header__isnull=False).order_by('left_id')
        _report_done(starttime)

        starttime = time.time()
        _announce('Writing reordered alignment...')
        with open('ml_tree_ordered.a2m', 'w') as ordered_file:
            for leaf in leaves:
                ordered_file.write('>%s\n' % leaf.sequence_header.header)
                ordered_file.write(
                    '%s\n'
                    % aligned_seq_of_seq_header[leaf.sequence_header].chars)

        # Replace <accession>.a2m with a symlink to the tree-ordered file.
        # lexists() also sees a dangling symlink left by an earlier run.
        try:
            if os.path.lexists(family_a2m):
                os.remove(family_a2m)
            os.symlink('ml_tree_ordered.a2m', family_a2m)
        except OSError as exc:
            sys.stderr.write(
                'Warning: could not link %s to ml_tree_ordered.a2m: %s\n'
                % (family_a2m, exc))
        _report_done(starttime)

    overall_endtime = time.time()
    print("Done polishing family %s. Overall: %s"
          % (bpg_accession, getTimeStr(overall_starttime, overall_endtime)))

    # The marker is written even if a step above reported a failure, as in
    # the original workflow; check stderr for warnings.
    _run_step(['touch', 'polish_family_20101223_done'])