Example #1
def prepare_g2p(test_case):
    # Support running from the micall/g2p directory during development,
    # or fall back to paths relative to the repository root.
    if os.path.exists('../g2p/g2p_fpr.txt'):
        test_case.pssm = Pssm(path_to_lookup='../g2p/g2p_fpr.txt',
                              path_to_matrix='../g2p/g2p.matrix')
    else:
        test_case.pssm = Pssm(path_to_lookup='micall/g2p/g2p_fpr.txt',
                              path_to_matrix='micall/g2p/g2p.matrix')

    test_case.g2p_csv = DummyFile()
    test_case.g2p_summary_csv = DummyFile()
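The working-directory fallback above recurs in these examples (see also the setUp in Example #6). A minimal sketch of a location-independent alternative, assuming a MICALL_ROOT environment variable; the constant names here are ours, not MiCall's:

import os

# Sketch only: resolve the g2p data files from a known root instead of the
# current working directory. MICALL_ROOT is a hypothetical variable pointing
# at the repository checkout; it defaults to the current directory.
REPO_ROOT = os.environ.get('MICALL_ROOT', '.')
LOOKUP_PATH = os.path.join(REPO_ROOT, 'micall', 'g2p', 'g2p_fpr.txt')
MATRIX_PATH = os.path.join(REPO_ROOT, 'micall', 'g2p', 'g2p.matrix')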
Example #2
    def test_multiple_sequences(self):
        pssm = Pssm()
        nucs = [('TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAG'
                 'AGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT'),
                ('TGTACAAGTCCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAG'
                 'AGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT')]
        expected_aa = None  # Not returned when submitting more than one seq.
        expected_scores = [0.06775, 0.06486]

        scores, aligned_aa = pssm.run_g2p(nucs)

        rounded_scores = [round(score, 5) for score in scores]
        self.assertEqual(expected_aa, aligned_aa)
        self.assertEqual(expected_scores, rounded_scores)
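Together with the single-sequence test in Example #5, this pins down run_g2p's calling convention. A sketch of that contract, inferred from these tests rather than from Pssm documentation (pssm and nucs reuse the names from the test above):

# Inferred contract, not documented API: a list of sequences returns a list
# of scores with aligned_aa set to None; a single string returns one float
# score plus the aligned amino acids as a list of lists.
scores, aligned_aa = pssm.run_g2p(nucs)      # batch: aligned_aa is None
score, aligned_aa = pssm.run_g2p(nucs[0])    # single: score is a float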
Example #3
def process_run(run_info, args):
    pssm = Pssm()

    for filename in os.listdir(run_info.scratch_path):
        filepath = os.path.join(run_info.scratch_path, filename)
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)

    if run_info.interop_path is None:
        run_summary = None
    else:
        logger.info('Summarizing run.')
        run_summary = summarize_run(run_info)

    with ProcessPoolExecutor(max_workers=args.max_active) as pool:
        for _ in pool.map(functools.partial(process_sample,
                                            args=args,
                                            pssm=pssm,
                                            use_denovo=run_info.is_denovo),
                          run_info.get_all_samples()):
            pass

        for _ in pool.map(functools.partial(process_resistance,
                                            run_info=run_info),
                          run_info.sample_groups):
            pass

    collate_samples(run_info)
    if run_summary is not None:
        summarize_samples(run_info, run_summary)
    if not args.keep_scratch:
        shutil.rmtree(run_info.scratch_path, ignore_errors=True)
    logger.info('Done.')
Example #4
def main():
    logger.info("Starting on %s with %d CPU's.", socket.gethostname(),
                multiprocessing.cpu_count())
    args = parse_args()
    if args.link_run is not None:
        json = link_json(args.link_run, args.data_path)
    else:
        json_path = os.path.join(args.data_path, 'input', 'AppSession.json')
        with open(json_path, 'r') as json_file:
            json = parse_json(json_file)
    pssm = Pssm()

    scratch_path = os.path.join(args.data_path, 'scratch')
    makedirs(scratch_path)
    for filename in os.listdir(scratch_path):
        filepath = os.path.join(scratch_path, filename)
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)

    if json.run_id is None:
        run_summary = None
    else:
        logger.info('Summarizing run.')
        run_summary = summarize_run(args, json)

    pool = Pool()
    pool.map(
        functools.partial(try_sample,
                          run_info=json,
                          data_path=args.data_path,
                          pssm=pssm), range(len(json.samples)))

    if json.run_id is not None:
        summarize_samples(args, json, run_summary)
    logger.info('Done.')
Example #5
    def test_single_sequence(self):
        pssm = Pssm()
        nucs = ('TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAG'
                'AGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT')
        expected_aa = [['C'], ['T'], ['R'], ['P'], ['N'], ['-'], ['N'], ['N'],
                       ['T'], ['-'], ['-'], ['R'], ['K'], ['S'], ['I'], ['H'],
                       ['I'], ['-'], ['-'], ['-'], ['G'], ['P'], ['G'], ['R'],
                       ['-'], ['-'], ['-'], ['A'], ['F'], ['Y'], ['A'], ['T'],
                       ['-'], ['-'], ['-'], ['-'], ['G'], ['E'], ['I'], ['I'],
                       ['G'], ['D'], ['I'], ['-'], ['-'], ['R'], ['Q'], ['A'],
                       ['H'], ['C']]
        expected_score = 0.067753

        score, aligned_aa = pssm.run_g2p(nucs)

        self.assertEqual(expected_aa, aligned_aa)
        self.assertAlmostEqual(expected_score, score, places=5)
Example #6
    def setUp(self):
        super(SamG2PTest, self).setUp()
        if os.path.exists('../g2p/g2p_fpr.txt'):
            self.pssm = Pssm(path_to_lookup='../g2p/g2p_fpr.txt',
                             path_to_matrix='../g2p/g2p.matrix')
        else:
            self.pssm = Pssm(path_to_lookup='micall/g2p/g2p_fpr.txt',
                             path_to_matrix='micall/g2p/g2p.matrix')

        self.nuc_csv = StringIO("""\
seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,A,C,G,T
HIV1B-env-seed,V3LOOP,15,877,1,0,0,0,100
HIV1B-env-seed,V3LOOP,15,981,105,0,0,0,100
""")
        self.g2p_csv = DummyFile()
        self.g2p_summary_csv = DummyFile()
        self.addTypeEqualityFunc(str, self.assertMultiLineEqual)
Example #7
    def test_ambiguous_sequence(self):
        pssm = Pssm()
        nucs = ('TGTACAAGWCCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAG'
                'AGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT')
        expected_aa = [['C'], ['T'], ['R', 'S'], ['P'], ['N'], ['-'], ['N'],
                       ['N'], ['T'], ['-'], ['-'], ['R'], ['K'], ['S'], ['I'],
                       ['H'], ['I'], ['-'], ['-'], ['-'], ['G'], ['P'], ['G'],
                       ['R'], ['-'], ['-'], ['-'], ['A'], ['F'], ['Y'], ['A'],
                       ['T'], ['-'], ['-'], ['-'], ['-'], ['G'], ['E'], ['I'],
                       ['I'], ['G'], ['D'], ['I'], ['-'], ['-'], ['R'], ['Q'],
                       ['A'], ['H'], ['C']]
        # Average of two possible scores (see test_multiple_sequences).
        expected_score = (0.06775 + 0.06486) / 2

        score, aligned_aa = pssm.run_g2p(nucs)

        self.assertEqual(expected_aa, aligned_aa)
        self.assertAlmostEqual(expected_score, score, places=5)
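Here 'W' is the IUPAC code for A or T, so this input expands to exactly the two unambiguous sequences scored in Example #2, and the expected score is their mean. A sketch of that relationship, reusing pssm and nucs from the test above:

# 'W' (A or T) expands to the two sequences from test_multiple_sequences;
# the ambiguous score is the average of the two unambiguous scores.
variants = [nucs.replace('W', base) for base in 'AT']
scores, _ = pssm.run_g2p(variants)
average = sum(scores) / len(scores)
assert abs(average - (0.06775 + 0.06486) / 2) < 1e-5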
Example #8
def main():
    args = parse_args()
    from micall.g2p.pssm_lib import Pssm
    pssm = Pssm()
    sam_g2p(pssm=pssm,
            remap_csv=args.remap_csv,
            nuc_csv=args.nuc_csv,
            g2p_csv=args.g2p_csv,
            g2p_summary_csv=args.g2p_summary_csv,
            min_count=DEFAULT_MIN_COUNT)
Example #9
def main():
    logging.basicConfig(level=logging.WARN)
    args = parse_args()
    sample = load_sample(args)

    pssm = Pssm()
    sample.process(pssm, force_gzip=True)  # dataset files change .gz to .raw

    with tarfile.open(args.coverage_maps_tar, mode='w') as tar:
        for image_name in os.listdir(sample.coverage_maps):
            image_path = os.path.join(sample.coverage_maps, image_name)
            archive_path = os.path.join('coverage_maps', image_name)
            tar.add(image_path, archive_path)
Example #10
def main():
    args = parse_args()
    from micall.g2p.pssm_lib import Pssm
    pssm = Pssm()
    fastq_g2p(pssm=pssm,
              fastq1=args.fastq1,
              fastq2=args.fastq2,
              g2p_csv=args.g2p_csv,
              g2p_summary_csv=args.g2p_summary_csv,
              unmapped1=args.unmapped1,
              unmapped2=args.unmapped2,
              aligned_csv=args.aligned_csv,
              min_count=DEFAULT_MIN_COUNT,
              min_valid=MIN_VALID,
              min_valid_percent=MIN_VALID_PERCENT)
Example #11
    def __init__(self, filename1, bad_cycles_filename):
        super(MicallDD, self).__init__()
        # Note: 'True or' short-circuits the condition, so the filter_fastqs()
        # branch below is currently dead code (apparently a debugging leftover).
        if True or 'filter' in filename1:
            self.filename1 = filename1
        else:
            self.filename1 = self.filter_fastqs(filename1)
        self.bad_cycles_filename = bad_cycles_filename
        self.pssm = Pssm()
        reads = defaultdict(list)
        read_fastq(self.filename1, reads)
        read_count = len(reads)
        read_fastq(get_reverse_filename(self.filename1), reads)
        added_count = len(reads) - read_count
        if added_count > 0:
            raise RuntimeError('Found {} new reads.'.format(added_count))
        self.reads = reads.values()
Example #12
def main():
    logger.info("Starting on %s with %d CPU's.",
                socket.gethostname(),
                multiprocessing.cpu_count())
    args = parse_args()
    if args.link_run is not None:
        run_info = link_samples(args.link_run, args.data_path)
    else:
        run_info = load_samples(args.data_path)
    pssm = Pssm()

    for filename in os.listdir(run_info.scratch_path):
        filepath = os.path.join(run_info.scratch_path, filename)
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)

    if run_info.interop_path is None:
        run_summary = None
    else:
        logger.info('Summarizing run.')
        run_summary = summarize_run(run_info)

    pool = Pool()
    pool.map(functools.partial(process_sample,
                               args=args,
                               pssm=pssm),
             run_info.get_all_samples())

    pool.close()
    pool.join()
    pool = Pool()
    pool.map(functools.partial(process_resistance,
                               run_info=run_info),
             run_info.sample_groups)

    pool.close()
    pool.join()
    collate_samples(run_info)
    if run_summary is not None:
        summarize_samples(run_info, run_summary)
    logger.info('Done.')
Example #13
             simple_prefix,
             pssm,
             ruby_script,
             delete_results=False)
        if not txtfilename.endswith('.txt'):
            with open(simple_prefix + '.txt', 'w') as simplefile:
                for line in simple_remap_lines:
                    simplefile.write(line)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Find the simplest test failure by trimming SAM files.')

    parser.add_argument('workdir', help='path to folder holding SAM files')
    parser.add_argument('ruby_script', help='path to Ruby version of G2P')
    parser.add_argument('--pattern',
                        default='*.remap.csv',
                        help='File name pattern to match SAM files')

    args = parser.parse_args()

    logger = init_logging_console_only(logging.INFO)
    pssm = Pssm(path_to_lookup='../g2p/g2p_fpr.txt',
                path_to_matrix='../g2p/g2p.matrix')
    for txtfilename in sorted(
            glob.glob(os.path.join(args.workdir, args.pattern))):
        logger.info(os.path.basename(txtfilename))
        compare_conseqs(txtfilename, args.ruby_script, pssm)
    logger.info('Done.')
Example #14
    def __init__(self, parent, *args, **kwargs):
        self.pssm = Pssm(
            path_to_lookup=AssetWrapper('micall/g2p/g2p_fpr.txt').path,
            path_to_matrix=AssetWrapper('micall/g2p/g2p.matrix').path)

        tk.Frame.__init__(self, parent, *args, **kwargs)
        self.parent = parent
        parent.report_callback_exception = self.report_callback_exception

        self.rundir = None  # path to MiSeq run folder containing data
        self.workdir = gettempdir()  # default to temp directory
        os.chdir(self.workdir)

        self.line_counter = LineCounter()

        self.run_info = None
        self.target_files = []

        self.button_frame = tk.Frame(self)
        self.button_frame.pack(side='top')
        self.console_frame = tk.Frame(self)
        self.console_frame.pack(side='top', fill='both', expand=True)

        try:
            with open(MiCall.CONFIG_FILE, 'r') as f:
                self.config = json.load(f)
        except (OSError, ValueError):
            self.config = {}

        self.nthreads = self.config.get('threads', None)
        if not self.nthreads:
            self.nthreads = int(round(cpu_count() * 0.5))
            self.config['threads'] = self.nthreads
            self.write_config()

        self.button_run = tk.Button(self.button_frame,
                                    text="Run",
                                    command=self.process_files)
        self.button_run.grid(row=0, column=1, sticky='W')

        self.progress_bar = Progressbar(self.button_frame,
                                        orient='horizontal',
                                        length=500,
                                        mode='determinate')
        self.progress_bar.grid(row=1, columnspan=5)

        scrollbar = tk.Scrollbar(self.console_frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.console = tk.Text(self.console_frame,
                               bg='black',
                               fg='white',
                               yscrollcommand=scrollbar.set)
        self.console.pack(side=tk.LEFT, fill=tk.BOTH)
        self.console.tag_configure('ERROR', foreground="red")
        scrollbar.config(command=self.console.yview)

        # redirect stderr to Text widget
        #sys.stderr = Redirector(self.console)

        self.write('Welcome to MiCall v{}, running with {} threads.\n'.format(
            pipeline_version, self.nthreads))
Example #15
def main():
    logger.info("Starting on %s with %d CPU's.", socket.gethostname(),
                multiprocessing.cpu_count())
    args = parse_args()
    if args.link_run is not None:
        run_json = link_json(args.link_run, args.data_path)
        run_json.has_runinfo = True
    else:
        json_path = os.path.join(args.data_path, 'input', 'AppSession.json')
        try:
            with open(json_path, 'r') as json_file:
                run_json = parse_json(json_file)
        except Exception:
            if os.path.exists(json_path):
                # copy the input file to the output dir for postmortem analysis
                logger.error("Error occurred while parsing '%s'", json_path)
                with open(json_path, 'r') as json_file:
                    file_cont = json_file.read()
                out_path = os.path.join(args.data_path, 'logs',
                                        'AppSession.json')
                with open(out_path, 'w') as json_file:
                    json_file.write(file_cont)
            else:
                logger.error("Error: no such file as '%s'" % json_path)
            raise
        # Do we have run_ids for all sample_ids?
        if run_json.run_id is None:
            run_json.has_runinfo = False
        else:
            bs = BSrequest()
            sample_id_set = bs.check_run_sample_ids(
                [run_json.run_id], [s["Id"] for s in run_json.samples])
            run_json.has_runinfo = (len(sample_id_set) == len(
                run_json.samples))
        logger.info("setting json.has_run_info to %s" % run_json.has_runinfo)
    pssm = Pssm()

    scratch_path = os.path.join(args.data_path, 'scratch')
    makedirs(scratch_path)
    for filename in os.listdir(scratch_path):
        filepath = os.path.join(scratch_path, filename)
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)
    args.g2p_path = args.qc_path = create_app_result(args.data_path,
                                                     run_json,
                                                     suffix='results')
    if run_json.run_id is None:
        run_summary = None
    else:
        logger.info('Summarizing run.')
        run_summary = summarize_run(args, run_json)

    pool = Pool()
    pool.map(
        functools.partial(try_sample, run_info=run_json, args=args, pssm=pssm),
        range(len(run_json.samples)))

    pool.close()
    pool.join()
    collate_samples(args, run_json)
    if run_json.run_id is not None:
        summarize_samples(args, run_json, run_summary)
    logger.info('Done.')
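Taken together, these examples exercise a small Pssm surface. A hypothetical summary sketch; the constructor keywords and return shapes are taken from the calls above, not from separate documentation:

from micall.g2p.pssm_lib import Pssm  # import path as in Examples #8 and #10

# Construct with explicit data files (Examples #1, #6, #13, #14) or with
# the packaged defaults (most other examples):
pssm = Pssm(path_to_lookup='micall/g2p/g2p_fpr.txt',
            path_to_matrix='micall/g2p/g2p.matrix')
pssm = Pssm()

# run_g2p scores V3-loop nucleotide sequences:
#   single string    -> (float score, aligned amino acids as list of lists)
#   list of strings  -> (list of float scores, None)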