Ejemplo n.º 1
0
    def start(self):
        """Queue target and baseline domain-extraction jobs for every run.

        For each of ``self.num_runs`` runs, one target set and one baseline
        set are drawn with ``generate_sequence_set`` and submitted to
        ``_extract_domains`` on a process pool created with ``args['n']``;
        ``self._callback`` is attached to every resulting future. The pool is
        shut down after all jobs have been submitted. A ``KeyboardInterrupt``
        is caught, the pool is shut down, and the interrupt is not re-raised.
        """
        status_message('Multiple alignment running','please wait')
        args = self.args
        executor = concurrent.futures.ProcessPoolExecutor(args['n'])

        def submit_job(sequences, is_baseline):
            # Queue a single extraction job; options are looked up at call
            # time, exactly as an inline submit would do.
            future = executor.submit(
                _extract_domains, sequences, is_baseline=is_baseline,
                input_state=self.input_state, threshold=args['thresh'],
                thresh_type=args['type'], max_domain_size=args['win'],
                min_domain_size=args['minwin'])
            future.add_done_callback(self._callback)

        try:
            for run_index in range(self.num_runs):
                if self.debug >= 1:
                    print("DomainExtractionDriver: Run {} of {}".format(
                        run_index + 1, self.num_runs))

                # Target job for this run.
                targets_sub = generate_sequence_set(
                    self.targets, args['subsample'], args['random_subset'],
                    args['random_order'], args['subsample_start'])
                submit_job(targets_sub, False)

                # Baseline job; with 'disjoint_subset' set, the sequences
                # already chosen as targets are passed as disallowed.
                disallowed = targets_sub if args['disjoint_subset'] else []
                baselines_sub = generate_sequence_set(
                    self.baselines, args['subsample'], args['random_subset'],
                    args['random_order'], args['subsample_start'],
                    disallowed=disallowed)
                submit_job(baselines_sub, True)

            executor.shutdown()
            #self.close_output_buffers()
            status_message('Analysis complete', 'OK')
        except KeyboardInterrupt:
            executor.shutdown()
Ejemplo n.º 2
0
def generate_consensus_newick(msa, consensus, filename):
    """Build a Newick-style string for a consensus and write it to *filename*.

    The consensus string (``consensus.consensus``) is scanned left to right.
    Each position yields one segment made of a bracket prefix, the consensus
    symbol, ``-``, the conservation rounded to 3 decimals, ``:`` and the
    variability rounded to 3 decimals. Segments are assembled in reverse
    order (the last position comes first) and terminated with ``;``.

    Args:
        msa: Alignment object passed through to ``calculate_conservation``.
        consensus: Object exposing the consensus string as ``.consensus``.
        filename: Path of the file the Newick string is written to.

    Returns:
        The generated Newick string.
    """
    cons_str = consensus.consensus
    conservation, variability = calculate_conservation(msa, consensus)

    # Stack of (flag, count) pairs. "A" pushes (0, 0); "C" increments the
    # count of the top entry; "T" pops entries until one with flag 0 is found.
    a_stack = [(1, 0)]
    print(cons_str)

    segments = []
    for position, symbol in enumerate(cons_str):
        if symbol == "A":
            a_stack.append((0, 0))
            node_string = ")"
        elif symbol == "C":
            node_string = ",:1)"
            flag, count = a_stack.pop()
            a_stack.append((flag, count + 1))
        elif symbol == "T":
            node_string = "(:1,:1)"
            while a_stack:
                flag, count = a_stack.pop()
                if flag == 0:
                    # Reached the enclosing "A" entry: open its brackets,
                    # replace it with a (1, 1) entry and stop unwinding.
                    node_string = "," + count * "(" + node_string
                    a_stack.append((1, 1))
                    break
                node_string = count * "(" + node_string
        # NOTE(review): any other symbol reuses the previous position's
        # node_string (or raises UnboundLocalError at the first position).
        # Kept unchanged; confirm whether other symbols can occur.

        segments.append(
            node_string
            + symbol
            + "-"
            + str(round(conservation[position], 3))
            + ":"
            + str(round(variability[position], 3))
        )

    # Later positions are placed in front of earlier ones.
    newick_string = "".join(reversed(segments)) + ";"

    status_message("Generating newick string", "OK")
    # Write to the path given by the caller rather than a global ``args``.
    with open(filename, "w") as handle:
        handle.write(newick_string)
    return newick_string
Ejemplo n.º 3
0
def extract_and_run_stats(args):
    """Compute per-position conservation and variability for a parsed build.

    The build named by ``args['build']`` is parsed and its first consensus is
    compared against every alignment row. For each non-gap consensus position
    the function records:

    * conservation: the fraction of alignments with a non-gap character at
      that position;
    * variability: the sum, over the gap columns preceding that position, of
      ``1 / ungapped_length`` for every alignment with a character there.

    If ``args`` has a non-empty ``'newick'`` entry, a Newick string is
    generated from these statistics and written to that path.

    Args:
        args: Mapping of options; ``'build'`` is required, ``'newick'`` is
            optional.
    """
    msa = XMLBuildReader(args['build']).parse()
    consensus = msa.consensuses[0]
    alignments = msa.alignments

    # Weight of one inserted character for each alignment. A row made only of
    # gaps never contributes, so it gets weight 0.0 instead of dividing by 0.
    contributions = []
    for alignment in alignments:
        ungapped_length = len(alignment.replace('-', ''))
        contributions.append(1 / ungapped_length if ungapped_length else 0.0)

    current_var_sum = 0
    variability_sums = []
    conservation = []
    # Walk every column of the gapped (composite) alignment.
    for position in range(len(msa.composite)):
        if consensus.seq[position] == '-':
            # Gap in the consensus: accumulate the weight of every alignment
            # that has a character in this column.
            for alignment, contribution in zip(alignments, contributions):
                if alignment[position] != '-':
                    current_var_sum += contribution
        else:
            # Consensus character: store the variability gathered since the
            # previous consensus character. The index within the ungapped
            # consensus equals the current length of variability_sums.
            variability_sums.append(current_var_sum)
            current_var_sum = 0

            # Fraction of alignments with a character in this column.
            occupied = sum(1 for alignment in alignments if alignment[position] != '-')
            conservation.append(occupied / len(alignments))

    # Skip the output step when the option is absent, None or empty.
    if args.get('newick'):
        status_message('Generating newick string','OK')
        newick_string = generate_consensus_newick(consensus, conservation, variability_sums)
        with open(args['newick'], 'w') as handle:
            handle.write(newick_string)
Ejemplo n.º 4
0
            msa.add_consensus(threshold, msa.build_consensus(float(threshold)))
            print("Consensus at threshold " + threshold + ": " + msa.get_consensus(threshold).consensus)

        if args["newick"]:
            if args["threshold"]:
                generate_consensus_newick(msa, msa.get_consensus(float(args["threshold"])), args["newick"])
            else:
                generate_consensus_newick(msa, list(msa.consensuses.values())[0], args["newick"])

        if args["scores"]:
            generate_score_and_conserved_chars_file(msa, args["scores"])

        if args["k"]:
            threshold, k = msa.find_conservation_boundary()
            print(
                "At threshold %.2f, average conservation and proportion of conserved characters are both %.2f%%"
                % (threshold, k * 100)
            )

        if args["newbuild"]:
            #            consensus_object = msa.build_consensus(args['threshold'],args['type'])

            # Write MSA and consensus to file
            consensus_fact = ConsensusFilterFactory(msa, msa.get_consensus(threshold))
            consensus_fact.write(fname=args["newbuild"])

        status_message("Consensus statistics computation complete ", "OK")

    except (IOError, KeyboardInterrupt, IndexError) as e:
        print(str(e))
Ejemplo n.º 5
0
        elif cons_str[position] == 'T':
            node_string = '(:1,:1)'
            not_done = 1
            while not_done and len(a_stack) > 0:
                a_val = a_stack.pop()
                if a_val[0] == 0:
                    node_string = ','+a_val[1]*'('+node_string
                    a_stack.append((1,1))
                    not_done = 0
                else:
                    node_string = a_val[1]*'('+node_string
            
        newick_string = node_string + cons_str[position] + '-' + str(round(conservation[position],3)) + ':' + str(round(variability_sums[position],3)) + newick_string

    newick_string += ';'
    return newick_string

if __name__ == '__main__':
    # Command-line entry point: parse the arguments, print the version banner
    # and run the statistics. I/O errors, index errors and Ctrl-C are reported
    # by printing the exception text.
    try:
        args = ConsensusStatsCommandParser().parse_args()
        # Argument validation is currently switched off:
        # ConsensusStatsArgumentValidator(args)

        print('capellini - v.{!s}\n============='.format(version))

        extract_and_run_stats(args)

        status_message('Consensus statistics computation complete ', 'OK')

    except (IOError, KeyboardInterrupt, IndexError) as err:
        print(err)
Ejemplo n.º 6
0
        if args['mode'] == 'single':
            print('penne - v.' + str(version) + '\n=============')
            cons_query = XMLBuildReader(args['query']).parse().consensuses[0]
            cons_baseline = XMLBuildReader(args['baseline']).parse().consensuses[0]
        
            # next, yield domains for both query and baseline datasets. 
            dsb_query = DomainSetBuilder(cons_query, args['win'], args['max_g'],
                         args['strip'], is_enum=args['enumerate'])
            dsb_baseline = DomainSetBuilder(cons_baseline, args['win'], args['max_g'],
                         args['strip'], is_enum=args['enumerate'])
#            dsb_baseline = DomainSetBuilder(win=args['win'], max_gap=args['max_g'], 
#                         is_enum=args['enumerate'], consensus=cons_baseline,
#                         is_strip=args['strip'])
            domains_query = dsb_query.build() # build abundance counts
            domains_baseline = dsb_baseline.build()
            status_message('Identifying domains', 'OK')
            db = DomainAbundanceBuilder(query=domains_query, baseline=domains_baseline)
            domains = db.build() # build contingency matrices 
            dpp = DomainPrettyPrinter(domains = domains, pval = args['p'],
                                  out=args['o'])
            dpp.display() # pretty-print domains
            status_message('Domain over-representation computation complete ', 'OK')
        else:
            args.update({'f':args['query'],'f2':args['baseline'],'a':None})
            input_state = InputWrapperState(args)
            #input_state.assign_matrix() # parse in-built or custom matrix
            targets = input_state.parse_fasta(input_state.fname)
            baselines = input_state.parse_fasta(input_state.fname2)
            if not args['overlap']:
                target_names = list([target.name for target in targets])
                baselines = list([baseline for baseline in baselines if baseline.name not in target_names])
Ejemplo n.º 7
0
            cons_query = XMLBuildReader(args["query"]).parse().consensuses[0]
            cons_baseline = XMLBuildReader(args["baseline"]).parse().consensuses[0]

            # next, yield domains for both query and baseline datasets.
            dsb_query = DomainSetBuilder(
                cons_query, args["win"], args["max_g"], args["strip"], is_enum=args["enumerate"]
            )
            dsb_baseline = DomainSetBuilder(
                cons_baseline, args["win"], args["max_g"], args["strip"], is_enum=args["enumerate"]
            )
            #            dsb_baseline = DomainSetBuilder(win=args['win'], max_gap=args['max_g'],
            #                         is_enum=args['enumerate'], consensus=cons_baseline,
            #                         is_strip=args['strip'])
            domains_query = dsb_query.build()  # build abundance counts
            domains_baseline = dsb_baseline.build()
            status_message("Identifying domains", "OK")
            db = DomainAbundanceBuilder(query=domains_query, baseline=domains_baseline)
            domains = db.build()  # build contingency matrices
            dpp = DomainPrettyPrinter(domains=domains, pval=args["p"], out=args["o"])
            dpp.display()  # pretty-print domains
            status_message("Domain over-representation computation complete ", "OK")
        else:
            args.update({"f": args["query"], "f2": args["baseline"], "a": None})
            input_state = InputWrapperState(args)
            # input_state.assign_matrix() # parse in-built or custom matrix
            targets = input_state.parse_fasta(input_state.fname)
            baselines = input_state.parse_fasta(input_state.fname2)
            if not args["overlap"]:
                target_names = list([target.name for target in targets])
                baselines = list([baseline for baseline in baselines if baseline.name not in target_names])