Example #1
def tab_output_equal(csvfile=None, jsonfile=None, pdfile=None, fastafile=None, fastastructures=None, ref_seqs_file=None):
    c = None
    j = None
    p = None
    ff = None
    fs = None
    if csvfile is not None:
        c = pd.read_csv(csvfile)['best_sequence'].tolist()
    if jsonfile is not None:
        with open(jsonfile, 'r') as f:
            j = blastsearchrecomputefromdict(json.load(f)).hits
            j = [i.extension for i in j]
            j = [str(i.seq) for i in j]
    if pdfile is not None:
        p = pd.read_pickle(pdfile)['best_sequence'].tolist()
    if fastafile is not None:
        with open(fastafile, 'r') as f:
            ff = [str(i.seq) for i in SeqIO.parse(f, format='fasta')]
    if fastastructures is not None:
        fs = [str(i.seq) for i in parse_named_structure_file(fastastructures)]

    outputs = [c, j, p, ff, fs]
    outputs = [i for i in outputs if i is not None]

    if ref_seqs_file is not None:
        with open(ref_seqs_file, 'r') as f:
            mydata = json.load(f)
        bb = blastsearchrecomputefromdict(mydata).hits
        bb = [i.extension for i in bb]
        bb = [str(i.seq) for i in bb]

        outputs.append(bb)

    # check length
    if not all(len(i) == len(outputs[0]) for i in outputs):
        raise AssertionError(
            'Output files do not have the same length.'
        )

    # check sequences (all outputs should have same output sequences)
    for ll in zip(*outputs):
        if not all(i == ll[0] for i in ll):
            raise AssertionError(
                'All output sequences should be the same.'
            )
    return True
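The cross-format check above boils down to two steps: every requested output must contain the same number of sequences, and the sequences must agree position by position. A minimal self-contained sketch of that core logic, with hypothetical sequence lists standing in for parsed files:

def all_outputs_equal(*outputs):
    # drop outputs that were not requested (None placeholders)
    outputs = [o for o in outputs if o is not None]
    if not outputs:
        return True
    # every output must contain the same number of sequences
    if not all(len(o) == len(outputs[0]) for o in outputs):
        raise AssertionError('Output files do not have the same length.')
    # the sequences at each position must agree across all outputs
    for row in zip(*outputs):
        if not all(seq == row[0] for seq in row):
            raise AssertionError('All output sequences should be the same.')
    return True

assert all_outputs_equal(['ACGU', 'GGCC'], ['ACGU', 'GGCC'], None)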
Example #2
    def test_output_with_sequence_fail(self):
        with open(self.json_file, 'r') as f:
            mydata = json.load(f)
        bb = convert_classes.blastsearchrecomputefromdict(mydata)
        hit = bb.hits.pop(1)
        hit.extension = None
        bb.hits_failed.append(hit)
        with open(self.htmlo, 'wb') as h:
            h.write(write_html_output(bb))

        try:
            with open(self.htmlo, 'rb') as f, open(
                os.path.abspath(
                    os.path.dirname(__file__) + '/test_data/RF00001_reference_missing_hit.html.md5'
                )
            ) as r:
                self.assertEqual(
                    hashlib.md5(f.read()).hexdigest(),
                    r.read()
                )
        finally:
            try:
                os.remove(self.htmlo)
            except OSError:
                print('removing temporary test files failed')
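The test compares an MD5 digest of the generated HTML with a digest stored next to the test data. A minimal stdlib sketch of that comparison; the helper names and file paths are hypothetical:

import hashlib

def file_md5(path):
    # read in binary mode so the digest does not depend on line-ending translation
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()

def matches_reference(generated_file, reference_md5_file):
    with open(reference_md5_file) as r:
        # strip() tolerates a trailing newline in the stored digest
        return file_md5(generated_file) == r.read().strip()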
Example #3
def prepare_new_htmlout():
    json_file = os.path.abspath(
        os.path.dirname(__file__) + '/../RF00001_output.json')
    with open(json_file, 'r') as f:
        mydata = json.load(f)
    bb = convert_classes.blastsearchrecomputefromdict(mydata)
    fd, html_file = tempfile.mkstemp(prefix='rba_', suffix='_t30')
    os.close(fd)

    with open(html_file, 'wb') as h:
        h.write(write_html_output(bb))

    target = os.path.abspath(
        os.path.dirname(__file__) + '/../RF00001_reference_output.html.md5')
    with open(html_file, 'rb') as f, open(target, 'w') as t:
        t.write(hashlib.md5(f.read()).hexdigest())

    os.remove(html_file)

    bb.hits[1].extension = None
    with open(html_file, 'wb') as h:
        h.write(write_html_output(bb))

    target = os.path.abspath(
        os.path.dirname(__file__) +
        '/../RF00001_reference_missing_hit.html.md5')
    with open(html_file, 'rb') as f, open(target, 'w') as t:
        t.write(hashlib.md5(f.read()).hexdigest())
    os.remove(html_file)
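prepare_new_htmlout relies on the mkstemp idiom throughout: create a uniquely named temporary file, close the OS-level descriptor immediately, and reopen the path by name as needed. A minimal stdlib sketch of that idiom with a placeholder payload:

import os
import tempfile

# mkstemp returns an open OS-level descriptor and the file path;
# close the descriptor right away and work with the path instead
fd, tmp_path = tempfile.mkstemp(prefix='rba_', suffix='.html')
os.close(fd)
try:
    with open(tmp_path, 'wb') as h:
        h.write(b'<html></html>')  # placeholder payload
finally:
    os.remove(tmp_path)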
Example #4
    def setUp(self):
        self.json_file = os.path.abspath(
            os.path.dirname(__file__) + '/test_data/RF00001_output.json')
        with open(self.json_file, 'r') as f:
            mydata = json.load(f)
        self.data = blastsearchrecomputefromdict(mydata)
Example #5
    def test_blastrecompute_with_blast_data(self):
        # load blast data
        blast_outputs = []
        with open(
                os.path.join(fwd, 'test_data',
                             'blast_parse_hits_txt_standalone.txt'), 'r') as f:
            for r in blast_parse_txt(f):
                blast_outputs.append(r)

        q = SeqRecord(Seq('ACGUTGU'), id='qq')
        s = BlastSearchRecompute(None, q, 0)
        h = HitList()
        a = Subsequences(
            SeqRecord(Seq('ACGUTGU'),
                      id='aa',
                      annotations={
                          'blast': (0, blast_outputs[0].alignments[0].hsps[0])
                      }))
        b = Subsequences(
            SeqRecord(Seq('ACGAUCGUGAC'),
                      id='bb',
                      annotations={
                          'blast': (1, blast_outputs[0].alignments[0].hsps[1])
                      }))
        h.append(a)
        h.append(b)
        s.hits = h
        s.query = SeqRecord(Seq('ACGUGUGCA'), id='query')
        s.args = Namespace(**{'aa': 'asdq', 'bb': 'acoi'})
        encoded = convert_classes.blastsearchrecompute2dict(s)
        encoded_json = json.dumps(encoded)
        encoded = json.loads(encoded_json)
        decoded = convert_classes.blastsearchrecomputefromdict(encoded)
        tc.recrusive_compare(s, decoded)
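The round trip exercised here is object -> dict -> JSON string -> dict -> object, which proves the dict form survives actual serialization, not just in-memory conversion. A generic self-contained sketch of the same pattern; Record and its two converters are hypothetical stand-ins for the project classes:

import json
from dataclasses import dataclass

@dataclass
class Record:
    id: str
    seq: str

def record2dict(r):
    return {'id': r.id, 'seq': r.seq}

def recordfromdict(d):
    return Record(id=d['id'], seq=d['seq'])

original = Record(id='qq', seq='ACGUTGU')
# encode, force through a real JSON string, then decode again
decoded = recordfromdict(json.loads(json.dumps(record2dict(original))))
assert decoded == original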
Example #6
def tab_output_equal_structures(csvfile=None, jsonfile=None, pdfile=None, fastastructures=None):

    names = method_required_tools.keys()

    cc = None
    jj = None
    pp = None
    fsfs = None
    if csvfile is not None:
        c = pd.read_csv(csvfile)
        cc = dict()
        for n in names:
            if n in c.columns:
                cc[n] = c[n].tolist()
    if jsonfile is not None:
        with open(jsonfile, 'r') as f:
            j = blastsearchrecomputefromdict(json.load(f)).hits
            j = [i.extension for i in j]
            jj = dict()
            for s in j:
                for key in s.letter_annotations.keys():
                    if key not in jj:
                        jj[key] = []
                    jj[key].append(s.letter_annotations[key])
    if pdfile is not None:
        p = pd.read_pickle(pdfile)
        pp = dict()
        for n in names:
            if n in p.columns:
                pp[n] = p[n].tolist()
    if fastastructures is not None:
        fs = list(parse_named_structure_file(fastastructures))
        assert all(len(i.letter_annotations) == len(fs[0].letter_annotations) for i in fs)
        fsfs = dict()
        for s in fs:
            for key in s.letter_annotations.keys():
                if key not in fsfs:
                    fsfs[key] = []
                fsfs[key].append(s.letter_annotations[key])

    outputs = [cc, jj, pp, fsfs]
    outputs = [i for i in outputs if i is not None]

    # check length
    if not all(len(i) == len(outputs[0]) for i in outputs):
        raise AssertionError(
            'Output files do not have the same length.'
        )

    # check structures (all outputs should have same output structures)
    keys = outputs[0].keys()
    for k in keys:
        for ss in zip(*[ll[k] for ll in outputs]):
            if not all(i == ss[0] for i in ss):
                raise AssertionError(
                    'All output structures should be the same.'
                )
    return True
Example #7
    def setUp(self):
        bixml = os.path.abspath(
            os.path.dirname(__file__) + '/test_data/web_multi_hit.xml')
        with open(bixml, 'r') as b:
            self.blast = blast_hsps2list([i for i in NCBIXML.parse(b)][0])

        jfile = os.path.abspath(
            os.path.dirname(__file__) + '/test_data/RF00001_output.json')
        with open(jfile, 'r') as j:
            self.res = blastsearchrecomputefromdict(json.load(j))
Example #8
    def setUp(self):
        with open(ref_json_file, 'r') as f:
            mydata = json.load(f)
        bb = convert_classes.blastsearchrecomputefromdict(mydata)
        self.data = bb

        ff, csv = tempfile.mkstemp(prefix='rba_', suffix='_t1')
        os.close(ff)
        self.csv = csv

        ff, html = tempfile.mkstemp(prefix='rba_', suffix='_t2')
        os.close(ff)
        # note: the temp file created above is not used; the HTML output path
        # is derived from the query file name instead
        self.html = blast_query + 'test_html.html'

        ff, pandas_dump = tempfile.mkstemp(prefix='rba_', suffix='_t3')
        os.close(ff)
        self.pandas_dump = pandas_dump

        ff, json_file = tempfile.mkstemp(prefix='rba_', suffix='_t4')
        os.close(ff)
        self.json = json_file

        ff, fasta = tempfile.mkstemp(prefix='rba_', suffix='_t5')
        os.close(ff)
        self.fasta = fasta

        ff, allHits_fasta = tempfile.mkstemp(prefix='rba_', suffix='_t6')
        with os.fdopen(ff, 'w') as f:
            SeqIO.write([i.extension for i in self.data.hits], f, 'fasta')

        self.fasta_structures = allHits_fasta

        self.args = Pseudoargs(
            blast_query,
            blast_in,
            blast_db,
            b_type='plain',
            prediction_method=['rnafold'],
            blast_regexp=r'(?<=\|)[A-Z0-9]*\.?\d*$',
            enable_overwrite=True,
            html=self.html,
        )

        self.func_args = {
            'query': self.data.query,
            'seqs2predict_fasta': allHits_fasta,
            'pred_method_params': {},
            'all_hits_list': [i.extension for i in self.data.hits],
            'seqs2predict_list': [i.extension for i in self.data.hits],
            'use_cm_file': 'abc',
        }
        remove_files_with_try([blast_in + '.tmp_rboAnalyzer'])
Example #9
    def test_blastrecompute(self):
        q = SeqRecord(Seq('ACGUTGU'), id='qq')
        s = BlastSearchRecompute(None, q, 0)
        h = HitList()
        a = Subsequences(SeqRecord(Seq('ACGUTGU'), id='aa'))
        b = Subsequences(SeqRecord(Seq('ACGAUCGUGAC'), id='bb'))
        h.append(a)
        h.append(b)
        s.hits = h
        s.query = SeqRecord(Seq('ACGUGUGCA'), id='query')
        s.args = Namespace(**{'aa': 'asdq', 'bb': 'acoi'})
        encoded = convert_classes.blastsearchrecompute2dict(s)
        encoded_json = json.dumps(encoded)
        encoded = json.loads(encoded_json)
        decoded = convert_classes.blastsearchrecomputefromdict(encoded)
        tc.recrusive_compare(s, decoded)
Example #10
    def test_continuation2(self):
        with open(blast_output, 'r') as f, open(self.test_backup_file,
                                                'w') as ff:
            data = blastsearchrecomputefromdict(json.load(f))
            data.args.blast_in = blast_in
            data.args.json = None
            data.args.html = test_output_file
            data.args.sha1 = self.sha1
            data.args.prediction_method += ['centroid']

            new_structures = {'rnafold': []}
            for h in data.hits:
                n = copy(h.extension)
                n.letter_annotations['ss0'] = n.letter_annotations['rnafold']
                del n.letter_annotations['rnafold']

                new_structures['rnafold'].append(n)

            json.dump([blastsearchrecompute2dict(data)], ff, indent=2)

        out = lunch_with_args(self.args)
        self.assertEqual(1, 1)

        # test_output
        for i in range(len(out)):
            out[i].to_csv(self.csv)
            j_obj = json.dumps(blastsearchrecompute2dict(out[i]), indent=2)
            with open(self.json, 'w') as ff:
                ff.write(j_obj)
            out[i].write_results_fasta(self.fasta)
            out[i].write_results_structures(self.fasta_structures)

            t = tab_output_equal(csvfile=self.csv,
                                 jsonfile=self.json,
                                 fastafile=self.fasta,
                                 fastastructures=self.fasta_structures,
                                 ref_seqs_file=os.path.join(
                                     fwd, test_data_dir, 'simple.json'))
            self.assertEqual(t, True)
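Note that copy(h.extension) above is a shallow copy: unless the class overrides __copy__, attribute containers such as letter_annotations are shared between the copy and the original, so deleting a key on the copy also mutates the original hit. A quick stdlib demonstration with a hypothetical Hit class; use deepcopy when independent copies are required:

from copy import copy

class Hit:
    def __init__(self):
        self.letter_annotations = {'rnafold': '((..))'}

h = Hit()
n = copy(h)  # shallow copy: the dict attribute is shared
n.letter_annotations['ss0'] = n.letter_annotations['rnafold']
del n.letter_annotations['rnafold']
# the original object sees the same mutation
assert h.letter_annotations == {'ss0': '((..))'}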
Example #11
def lunch_computation(args_inner, shared_list=None):
    ml.debug(fname())
    if not shared_list:
        shared_list = []

    # update params if different config is requested
    CONFIG.override(tools_paths(args_inner.config_file))

    p_blast = BA_support.blast_in(args_inner.blast_in, b=args_inner.b_type)
    query_seqs = [i for i in SeqIO.parse(args_inner.blast_query, 'fasta')]

    if len(p_blast) != len(query_seqs):
        ml.error(
            'Number of query sequences in provided BLAST output file ({}) does not match number of query sequences'
            ' in query FASTA file ({}).'.format(len(p_blast), len(query_seqs)))
        sys.exit(1)

    # check if BLAST does not contain unexpected sequence characters
    validate_args.check_blast(p_blast)

    # create list of correct length if needed
    all_saved_data = [None] * len(query_seqs)
    saved_file = '{}.r-{}'.format(args_inner.blast_in, args_inner.sha1[:10])
    with open(saved_file, 'r+') as f:
        _saved = json.load(f)
        if _saved is None:
            f.seek(0)
            f.truncate()
            json.dump(all_saved_data, f)
        else:
            msg = "Loading backup data."
            print('STATUS: ' + msg)
            ml.info(msg + ' file: ' + saved_file)
            all_saved_data = _saved

            for saved_data in all_saved_data:
                # we can have partially computed data
                if saved_data is None:
                    continue
                if saved_data['args']['sha1'] != args_inner.sha1:
                    msg = "Input argument hash does not match the saved argument hash. "
                    if saved_data['args']['sha1'][:10] == args_inner.sha1[:10]:
                        msg += "This is because of truncating hashes to first 10 characters. "
                        msg += "Please remove the '{}' file.".format(
                            saved_file)
                        ml.error(msg)
                        sys.exit(1)
                    else:
                        msg += "Please remove the '{}' file.".format(
                            saved_file)
                        ml.error(msg)
                        sys.exit(1)

    multi_query = len(p_blast) > 1

    # this is done for each query
    ml_out_line = []
    all_analyzed = []
    for iteration, (bhp, query, saved_data) in enumerate(
            zip(p_blast, query_seqs, all_saved_data)):
        if saved_data is None:
            print('STATUS: processing query: {}'.format(query.id))
            validate_args.verify_query_blast(blast=bhp, query=query)

            analyzed_hits = BlastSearchRecompute(args_inner, query, iteration)
            analyzed_hits.multi_query = multi_query

            # run CM model build
            # allows failing fast if Rfam was selected and we don't find the model
            ih_model, analyzed_hits = find_and_extract_cm_model(
                args_inner, analyzed_hits)

            # select all
            all_blast_hits = BA_support.blast_hsps2list(bhp)

            if len(all_blast_hits) == 0:
                ml.error('No hits found in {} - {}. Nothing to do.'.format(
                    args_inner.blast_in, bhp.query))
                continue

            # filter if needed
            if args_inner.filter_by_eval is not None:
                tmp = filter_by_eval(all_blast_hits,
                                     BA_support.blast_hit_getter_from_hits,
                                     args_inner.filter_by_eval)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error(
                        'The requested filter removed all BLAST hits {} - {}. Nothing to do.'
                        .format(args_inner.blast_in, bhp.query))
                    continue
            elif args_inner.filter_by_bitscore is not None:
                tmp = filter_by_bits(all_blast_hits,
                                     BA_support.blast_hit_getter_from_hits,
                                     args_inner.filter_by_bitscore)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error(
                        'The requested filter removed all BLAST hits {} - {}. Nothing to do.'
                        .format(args_inner.blast_in, bhp.query))
                    continue

            all_short = all_blast_hits

            # now this is different for each mode
            if args_inner.mode == 'simple':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_simple_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            elif args_inner.mode == 'locarna':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_locarna_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            elif args_inner.mode == 'meta':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_meta_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            else:
                raise ValueError(
                    'Unknown option - should be caught by argparse.')

            if len(analyzed_hits.hits) == 0:
                ml.error(
                    "Extension failed for all sequences. Please see the error message. You can also try '--mode simple'."
                )
                sys.exit(1)

            analyzed_hits.copy_hits()

            with open(args_inner.blast_in + '.r-' + args_inner.sha1[:10],
                      'r+') as f:
                all_saved_data = json.load(f)
                all_saved_data[iteration] = blastsearchrecompute2dict(
                    analyzed_hits)
                f.seek(0)
                f.truncate()
                json.dump(all_saved_data, f, indent=2)

        else:
            print(
                'STATUS: extended sequences loaded from backup file for query {}'
                .format(query.id))
            analyzed_hits = blastsearchrecomputefromdict(saved_data)

            # overwrite the saved args with current
            # this will update used prediction methods and other non essential stuff
            analyzed_hits.args = args_inner

            if analyzed_hits.args.cm_file:
                cm_file_rfam_user = analyzed_hits.args.cm_file
            else:
                cm_file_rfam_user = None

        all_analyzed.append(analyzed_hits)

        # write all hits to fasta
        fda, all_hits_fasta = mkstemp(prefix='rba_',
                                      suffix='_22',
                                      dir=CONFIG.tmpdir)
        os.close(fda)
        analyzed_hits.write_results_fasta(all_hits_fasta)

        out_line = []
        # multiple prediction params
        if args_inner.dev_pred:
            dp_list = []
            # accommodate more dev pred outputs
            dpfile = None
            # note: str.strip removes the listed *characters* from both ends,
            # not a literal suffix; this works only for suitable file names
            if getattr(args_inner, 'dump', False):
                dpfile = args_inner.dump.strip('dump')
            if getattr(args_inner, 'pandas_dump', False):
                dpfile = args_inner.pandas_dump.strip('pandas_dump')
            if getattr(args_inner, 'json', False):
                dpfile = args_inner.json.strip('json')

            # optimization so the Rfam CM file is obtained only once
            if cm_file_rfam_user is None and 'rfam' in ''.join(
                    args_inner.prediction_method):
                best_model = get_cm_model(args_inner.blast_query,
                                          threads=args_inner.threads)
                rfam = RfamInfo()
                cm_file_rfam_user = run_cmfetch(rfam.file_path, best_model)

            for method in args_inner.prediction_method:
                # cycle the prediction method settings
                # get the set of params for each prediction
                selected_pred_params = [
                    kk for kk in args_inner.pred_params if method in kk
                ]
                shuffle(selected_pred_params)
                # for method_params in args_inner.pred_params:
                for i, method_params in enumerate(selected_pred_params):
                    ah = deepcopy(analyzed_hits)

                    random_flag = BA_support.generate_random_name(
                        8, shared_list)
                    shared_list.append(random_flag)

                    pname = re.sub(' ', '', str(method))
                    flag = '|pred_params|' + random_flag

                    # rebuild the args only with actually used prediction settings
                    ah.args.prediction_method = method
                    ah.args.pred_params = method_params

                    if getattr(args_inner, 'dump', False):
                        spa = args_inner.dump.split('.')
                        ah.args.dump = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'pandas_dump', False):
                        spa = args_inner.pandas_dump.split('.')
                        ah.args.pandas_dump = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'pdf_out', False):
                        spa = args_inner.pdf_out.split('.')
                        ah.args.pdf_out = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'json', False):
                        spa = args_inner.json.split('.')
                        ah.args.json = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]

                    wrapped_ending_with_prediction(
                        args_inner=ah.args,
                        analyzed_hits=ah,
                        pred_method=method,
                        method_params=method_params,
                        used_cm_file=cm_file_rfam_user,
                        multi_query=multi_query,
                        iteration=iteration,
                    )
                    success = True
                    out_line.append(to_tab_delim_line_simple(ah.args))

                    dp_list.append((i, method_params, success, flag, pname,
                                    random_flag, args_inner.pred_params))

            if dpfile is not None:
                with open(dpfile + 'devPredRep', 'wb') as devf:
                    pickle.dump(dp_list, devf)
        else:
            wrapped_ending_with_prediction(
                args_inner=args_inner,
                analyzed_hits=analyzed_hits,
                used_cm_file=cm_file_rfam_user,
                multi_query=multi_query,
                iteration=iteration,
            )
            out_line.append(to_tab_delim_line_simple(args_inner))

        ml_out_line.append('\n'.join(out_line))

        if cm_file_rfam_user is not None and os.path.exists(cm_file_rfam_user):
            BA_support.remove_one_file_with_try(cm_file_rfam_user)

        BA_support.remove_one_file_with_try(all_hits_fasta)
    return '\n'.join(ml_out_line), all_analyzed
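lunch_computation checkpoints its progress in a JSON backup file: a list with one slot per query, rewritten after each query finishes, so an interrupted run can be resumed. A minimal self-contained sketch of that checkpoint pattern; compute is a hypothetical stand-in for the per-query work:

import json
import os

def compute(query):
    return {'query': query, 'result': query[::-1]}  # stand-in for real work

def run_with_checkpoints(queries, backup_file):
    # load previous progress, or start with one empty slot per query
    if os.path.exists(backup_file):
        with open(backup_file) as f:
            slots = json.load(f)
    else:
        slots = [None] * len(queries)

    for i, query in enumerate(queries):
        if slots[i] is not None:
            continue  # already computed in a previous run
        slots[i] = compute(query)
        # rewrite the whole backup after every query; the seek(0)/truncate()
        # in the original serves the same purpose on an already-open handle
        with open(backup_file, 'w') as f:
            json.dump(slots, f, indent=2)
    return slots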
Example #12
    def setUp(self):
        with open(os.path.join(fwd, test_dir, 'RF00001_output.json'),
                  'r') as ff:
            ll = json.load(ff)
            self._bsdata = convert_classes.blastsearchrecomputefromdict(ll)
            self.data = self._bsdata.copy()