def main(argv=None):
    """Command-line entry point for the simulated-annealing run.

    Parses arguments, builds the annotated-interactions structures,
    configures the state classes from the CLI flags, runs the annealer,
    and logs progress throughout.

    :Parameters:
    - `argv`: optional argument list passed through to the CLI parser
      (defaults to `None`, letting the parser use `sys.argv`)

    """
    cli_parser = bpn.cli.SaCli()
    input_data = cli_parser.parse_args(argv)
    logger.info("Constructing supporting data structures; this may "
            "take a while...")
    annotated_interactions = bpn.structures.AnnotatedInteractionsArray(
            input_data.interactions_graph,
            input_data.annotations_dict
    )
    logger.info("Considering %d candidate links in total." %
            annotated_interactions.calc_num_links())
    logger.info("Constructing Simulated Annealing")
    # Select the parameters-state class from the CLI flags.
    if input_data.free_parameters:
        logger.info("Using free parameter transitions.")
        parameters_state_class = states.RandomTransitionParametersState
    else:
        parameters_state_class = states.PLNParametersState
    # Select the links-state class (swap transitions on or off).
    if input_data.disable_swaps:
        logger.info("Disabling swap transitions.")
        links_state_class = states.NoSwapArrayLinksState
    else:
        links_state_class = states.ArrayLinksState
    # Build the CSV writer for per-transition output; the detailed
    # variant records additional fields per state.
    if input_data.detailed_transitions:
        logger.info("Recording extra information for each state.")
        transitions_csvfile = convutils.make_csv_dict_writer(
                input_data.transitions_outfile,
                DETAILED_TRANSITIONS_FIELDNAMES
        )
    else:
        transitions_csvfile = convutils.make_csv_dict_writer(
                input_data.transitions_outfile,
                TRANSITIONS_FIELDNAMES
        )
    # NOTE(review): `transitions_csvfile` is never passed to the
    # annealer below, and the annealer receives no writer for
    # transitions output — confirm whether a constructor argument went
    # missing here.
    sa = simulatedannealing.ArraySimulatedAnnealing(
            annotated_interactions,
            input_data.activity_threshold,
            input_data.transition_ratio,
            num_steps=input_data.steps,
            temperature=input_data.temperature,
            end_temperature=input_data.end_temperature,
            parameters_state_class=parameters_state_class,
            links_state_class=links_state_class
    )
    logger.info("Beginning to Anneal. This may take a while...")
    sa.run()
    logger.info("Run completed.")
    logger.info("Writing link results to %s" % input_data.links_outfile.name)
    # NOTE(review): the two writers below are constructed but nothing
    # visible in this function writes through them — verify whether the
    # results are meant to be written here (cf. the MCMC `main`, which
    # calls explicit write_* methods after its run).
    links_out_csvwriter = convutils.make_csv_dict_writer(
            input_data.links_outfile, LINKS_FIELDNAMES)
    logger.info("Writing parameter results to %s" % (
            input_data.parameters_outfile.name))
    parameters_out_csvwriter = convutils.make_csv_dict_writer(
            input_data.parameters_outfile, PARAMETERS_FIELDNAMES)
    logger.info("Writing transitions data to %s."
            % (input_data.transitions_outfile.name))
    logger.info("Finished.")
def calculate_and_output_results_edge_swap(
        outfileh,
        pairs,
        total_pairs,
        interactions_graph,
        annotations_dict,
        num_permutations,
        num_edge_swap_events,
        use_estimation=True,
        score_correction=False
    ):
    """Calculates the significance of a link between each given pair of
    annotation terms using random graphs produced by edge swapping.

    :Parameters:
    - `outfileh`: a file handle to a file for output
    - `pairs`: an iterable of pairs of annotation terms
    - `total_pairs`: the number of total annotation pairs to be
      processed [NOTE: currently unused by this function; retained for
      signature compatibility with the resampling variant]
    - `interactions_graph`: graph containing the gene-gene or gene
      product-gene product interactions
    - `annotations_dict`: a dictionary with annotation terms as keys and
      `set`s of genes as values
    - `num_permutations`: maximum number of permutations to perform
      [NOTE: see `use_estimation`]
    - `num_edge_swap_events`: the number of edge swap events desired to
      produce each random graph. [NOTE: this number is multiplied by the
      number of edges in the `interactions_graph` to get the total
      number of edge swap events.]
    - `use_estimation`: estimate significances for pairs which are
      unlikely to have significant scores [default: `True`] [NOTE: using
      this option will not guarantee that the number of permutations
      specified by `num_permutations` will be performed.]
    - `score_correction`: if `True`, perform correction on scores using
      an expected value computed from the mean expression value
      [default: `False`]

    """
    # Create the output CSV file.
    csv_writer = convutils.make_csv_dict_writer(outfileh, OUTFILE_FIELDS)
    pair_statistics = compute_significance_for_pairs_edge_swap(
            pairs,
            interactions_graph,
            annotations_dict,
            num_permutations,
            num_edge_swap_events,
            use_estimation,
            score_correction
    )
    logger.info("Writing results to %s" % outfileh.name)
    # Use .items() rather than the Python-2-only .iteritems(): it is
    # equivalent on Python 2 (merely materializing the pairs once) and
    # keeps this function working on Python 3.
    write_results_to_csv(csv_writer, pair_statistics.items())
def calculate_and_output_scores(
        interactions_graph,
        annotations_dict,
        links,
        num_links,
        links_outfile
    ):
    """Calculate and output the link scores.

    :Parameters:
    - `interactions_graph`: graph containing the gene-gene or gene
      product-gene product interactions
    - `annotations_dict`: a dictionary with annotation terms as keys and
      `set`s of genes as values
    - `links`: pairs of annotation terms of which to calculate link
      scores
    - `num_links`: the number of links contained in `links`
    - `links_outfile`: file for output of link results

    """
    csv_writer = convutils.make_csv_dict_writer(
            links_outfile, OUTFILE_FIELDS)
    overlap_scores = []
    # Track the number processed explicitly; the original relied on the
    # loop variable `i` after the loop, which raised an
    # UnboundLocalError whenever `links` was empty.
    num_processed = 0
    for i, link_scores in enumerate(
            calculate_linkage_scores(
                interactions_graph,
                annotations_dict,
                links
            )
    ):
        num_processed = i + 1
        overlap_scores.append(link_scores)
        # periodically flush results to disk
        if not (num_processed % RESULTS_BUFFER_SIZE):
            percent_done = int(
                    math.floor(100 * num_processed / float(num_links)))
            logger.info("%d of %d (%d%%) links processed. "
                    "Writing to %s." % (num_processed, num_links,
                        percent_done, links_outfile.name)
            )
            csv_writer.writerows(overlap_scores)
            # flush the scores
            overlap_scores = []
    logger.info("%d of %d (100%%) links processed." % (
            num_processed, num_links))
    logger.info("Writing to %s" % links_outfile.name)
    # Write any scores still buffered since the last periodic flush.
    csv_writer.writerows(overlap_scores)
def main(argv=None):
    """Command-line entry point for the MCMC (Markov chain) run.

    Parses arguments, builds the annotated-interactions structures,
    seeds the RNG, selects the recorder/chain/state classes implied by
    the CLI flags (terms-based, genes-based, independent-terms, or the
    plain array model), runs the chain, writes all result files, and
    logs the total running time.

    :Parameters:
    - `argv`: optional argument list passed through to the CLI parser
      (defaults to `None`, letting the parser use `sys.argv`)

    """
    starting_time = datetime.datetime.now()
    cli_parser = bpn.cli.McmcCli()
    input_data = cli_parser.parse_args(argv)
    logger.info("Constructing supporting data structures; this may "
            "take a while...")
    # Terms-based runs use the 2D-array structure; otherwise the flat
    # array structure is used.
    if input_data.terms_based:
        annotated_interactions = (
                bpn.structures.AnnotatedInteractions2dArray(
                    input_data.interactions_graph,
                    input_data.annotations_dict,
                    stringent_coannotations=input_data.stringent_coannotations
                )
        )
    else:
        annotated_interactions = (
                bpn.structures.AnnotatedInteractionsArray(
                    input_data.interactions_graph,
                    input_data.annotations_dict,
                    stringent_coannotations=input_data.stringent_coannotations
                )
        )
    # Check to see whether the potential links form a single connected
    # component.
    check_link_components(annotated_interactions)
    # TODO: check a command line option to see if the user input a seed;
    # for now, we'll just generate one all the time and report it.
    random_seed = create_seed_value()
    logger.info("The random seed value for this run is {0}.".format(
            random_seed))
    random.seed(random_seed)
    logger.info("Constructing the Markov chain.")
    # Prepare the CSV writers for the state recorder.
    links_out_csvwriter = convutils.make_csv_dict_writer(
            input_data.links_outfile, LINKS_FIELDNAMES)
    parameters_out_csvwriter = convutils.make_csv_dict_writer(
            input_data.parameters_outfile, PARAMETERS_FIELDNAMES)
    if input_data.terms_based:
        terms_out_csvwriter = convutils.make_csv_dict_writer(
                input_data.terms_outfile, TERMS_FIELDNAMES)
    # Present the seed_links as indices.
    if input_data.seed_links:
        seed_links = [annotated_interactions.get_link_index(*link)
                for link in input_data.seed_links]
    else:
        seed_links = None
    # Choose the appropriate parameters class.
    if input_data.fixed_distributions:
        logger.info("Using fixed distributions for all parameters.")
        if input_data.terms_based:
            parameters_state_class = states.FixedTermPriorParametersState
        else:
            parameters_state_class = (
                    states.FixedDistributionParametersState)
    elif input_data.terms_based:
        parameters_state_class = states.TermPriorParametersState
    elif input_data.free_parameters:
        logger.info("Using free parameter transitions.")
        parameters_state_class = states.RandomTransitionParametersState
    else:
        parameters_state_class = states.PLNParametersState
    if input_data.terms_based:
        logger.info("Using terms-based model.")
        # seed_terms is only needed (and only defined) for the
        # genes-based and independent-terms variants below.
        if input_data.independent_terms or input_data.genes_based:
            if input_data.seed_terms:
                seed_terms = [
                        annotated_interactions.get_term_index(term)
                        for term in input_data.seed_terms
                ]
            else:
                seed_terms = None
        #else:
            #seed_terms = None
        if input_data.genes_based:
            # Genes-based model: term overlap is assessed through genes.
            if input_data.detailed_transitions:
                transitions_out_csvwriter = convutils.make_csv_dict_writer(
                        input_data.transitions_outfile,
                        GENES_BASED_TRANSITIONS_FIELDNAMES
                )
                state_recorder = (
                        recorders.DetailedGenesBasedStateRecorder(
                            annotated_interactions,
                            parameters_out_csvwriter,
                            links_out_csvwriter,
                            terms_out_csvwriter,
                            transitions_out_csvwriter
                        )
                )
            else:
                transitions_out_csvwriter = convutils.make_csv_dict_writer(
                        input_data.transitions_outfile,
                        TRANSITIONS_FIELDNAMES
                )
                # NOTE(review): this genes-based branch instantiates
                # TermsBasedStateRecorder while the detailed branch
                # above uses a genes-based recorder — confirm this
                # asymmetry is intentional.
                state_recorder = recorders.TermsBasedStateRecorder(
                        annotated_interactions,
                        parameters_out_csvwriter,
                        links_out_csvwriter,
                        terms_out_csvwriter,
                        transitions_out_csvwriter
                )
            logger.info("Assessing term overlap through genes.")
            markov_chain = chains.GenesBasedMarkovChain(
                    state_recorder,
                    input_data.burn_in,
                    input_data.steps,
                    annotated_interactions,
                    input_data.activity_threshold,
                    transition_type_ratio=input_data.transition_ratio,
                    seed_terms_indices=seed_terms,
                    seed_links_indices=seed_links,
                    link_false_pos=input_data.link_false_pos,
                    link_false_neg=input_data.link_false_neg,
                    link_prior=input_data.link_prior,
                    term_false_pos=input_data.term_false_pos,
                    term_false_neg=input_data.term_false_neg,
                    term_prior=input_data.term_prior,
            )
        else:
            if input_data.independent_terms:
                # Independent-terms model.
                if input_data.detailed_transitions:
                    transitions_out_csvwriter = (
                            convutils.make_csv_dict_writer(
                                input_data.transitions_outfile,
                                INDEPENDENT_TERMS_BASED_TRANSITIONS_FIELDNAMES
                            )
                    )
                    state_recorder = (
                            recorders.DetailedIndependentTermsBasedStateRecorder(
                                annotated_interactions,
                                parameters_out_csvwriter,
                                links_out_csvwriter,
                                terms_out_csvwriter,
                                transitions_out_csvwriter
                            )
                    )
                else:
                    transitions_out_csvwriter = (
                            convutils.make_csv_dict_writer(
                                input_data.transitions_outfile,
                                TRANSITIONS_FIELDNAMES
                            )
                    )
                    state_recorder = (
                            recorders.TermsBasedStateRecorder(
                                annotated_interactions,
                                parameters_out_csvwriter,
                                links_out_csvwriter,
                                terms_out_csvwriter,
                                transitions_out_csvwriter
                            )
                    )
                logger.info("Using independent-terms model.")
                markov_chain = chains.IndependentTermsBasedMarkovChain(
                        state_recorder,
                        input_data.burn_in,
                        input_data.steps,
                        annotated_interactions,
                        input_data.activity_threshold,
                        transition_type_ratio=input_data.transition_ratio,
                        seed_terms_indices=seed_terms,
                        seed_links_indices=seed_links,
                        link_false_pos=input_data.link_false_pos,
                        link_false_neg=input_data.link_false_neg,
                        link_prior=input_data.link_prior,
                        term_prior=input_data.term_prior,
                        parameters_state_class=parameters_state_class
                )
            else:
                # Plain terms-based model.
                if input_data.detailed_transitions:
                    transitions_out_csvwriter = (
                            convutils.make_csv_dict_writer(
                                input_data.transitions_outfile,
                                TERMS_BASED_TRANSITIONS_FIELDNAMES
                            )
                    )
                    state_recorder = (
                            recorders.DetailedTermsBasedStateRecorder(
                                annotated_interactions,
                                parameters_out_csvwriter,
                                links_out_csvwriter,
                                terms_out_csvwriter,
                                transitions_out_csvwriter
                            )
                    )
                else:
                    transitions_out_csvwriter = (
                            convutils.make_csv_dict_writer(
                                input_data.transitions_outfile,
                                TRANSITIONS_FIELDNAMES
                            )
                    )
                    state_recorder = (
                            recorders.TermsBasedStateRecorder(
                                annotated_interactions,
                                parameters_out_csvwriter,
                                links_out_csvwriter,
                                terms_out_csvwriter,
                                transitions_out_csvwriter
                            )
                    )
                if input_data.intraterms:
                    logger.info("Considering intra-term interactions.")
                    links_state_class = states.IntraTermsAndLinksState
                else:
                    links_state_class = states.TermsAndLinksState
                markov_chain = chains.TermsBasedMarkovChain(
                        state_recorder,
                        input_data.burn_in,
                        input_data.steps,
                        annotated_interactions,
                        input_data.activity_threshold,
                        transition_type_ratio=input_data.transition_ratio,
                        seed_links_indices=seed_links,
                        link_false_pos=input_data.link_false_pos,
                        link_false_neg=input_data.link_false_neg,
                        link_prior=input_data.link_prior,
                        term_prior=input_data.term_prior,
                        parameters_state_class=parameters_state_class,
                        links_state_class=links_state_class,
                )
    else:
        # Plain array (links-only) model.
        if input_data.disable_swaps:
            logger.info("Disabling swap transitions.")
            links_state_class = states.NoSwapArrayLinksState
        else:
            links_state_class = states.ArrayLinksState
        if input_data.detailed_transitions:
            logger.info("Recording extra information for each state.")
            transitions_out_csvwriter = convutils.make_csv_dict_writer(
                    input_data.transitions_outfile,
                    DETAILED_TRANSITIONS_FIELDNAMES
            )
            if input_data.record_frequencies:
                logger.info("Recording frequency information for each "
                        "state.")
                state_recorder = recorders.FrequencyDetailedArrayStateRecorder(
                        annotated_interactions,
                        parameters_out_csvwriter,
                        links_out_csvwriter,
                        transitions_out_csvwriter
                )
            else:
                state_recorder = recorders.DetailedArrayStateRecorder(
                        annotated_interactions,
                        parameters_out_csvwriter,
                        links_out_csvwriter,
                        transitions_out_csvwriter
                )
        else:
            transitions_out_csvwriter = convutils.make_csv_dict_writer(
                    input_data.transitions_outfile,
                    TRANSITIONS_FIELDNAMES,
                    # TODO: This is a hack to force
                    # FrequencyDetailedArrayStateRecorder to work
                    # without the details transitions flag
                    extrasaction="ignore"
            )
            if input_data.record_frequencies:
                logger.info("Recording frequency information for each "
                        "state.")
                state_recorder = recorders.FrequencyDetailedArrayStateRecorder(
                        annotated_interactions,
                        parameters_out_csvwriter,
                        links_out_csvwriter,
                        transitions_out_csvwriter
                )
            else:
                state_recorder = recorders.ArrayStateRecorder(
                        annotated_interactions,
                        parameters_out_csvwriter,
                        links_out_csvwriter,
                        transitions_out_csvwriter
                )
        markov_chain = chains.ArrayMarkovChain(
                state_recorder,
                input_data.burn_in,
                input_data.steps,
                annotated_interactions,
                input_data.activity_threshold,
                transition_type_ratio=input_data.transition_ratio,
                seed_links_indices=seed_links,
                link_false_pos=input_data.link_false_pos,
                link_false_neg=input_data.link_false_neg,
                link_prior=input_data.link_prior,
                parameters_state_class=parameters_state_class,
                links_state_class=links_state_class,
        )
    logger.debug("""\
Chain information:
    Chain class: {chain.__class__}
    Overall class: {chain.current_state.__class__}
    Links class: {chain.current_state.links_state.__class__}
    Parameters class: {chain.current_state.parameters_state.__class__}\
""".format(chain=markov_chain))
    logger.info("Beginning to run through states in the chain. This "
            "may take a while...")
    markov_chain.run()
    logger.info("Run completed.")
    # Write out all result files via the recorder.
    logger.info("Writing link results to {0}".format(
            input_data.links_outfile.name))
    markov_chain.state_recorder.write_links_probabilities()
    logger.info("Writing parameter results to {0}".format(
            input_data.parameters_outfile.name))
    markov_chain.state_recorder.write_parameters_probabilities()
    if input_data.terms_based:
        logger.info("Writing terms data to {0}.".format(
                input_data.terms_outfile.name))
        markov_chain.state_recorder.write_terms_probabilities()
    markov_chain.state_recorder.write_transition_states()
    logger.info("Transitions data written to {0}.".format(
            input_data.transitions_outfile.name))
    if input_data.record_frequencies:
        logger.info("Writing state frequencies to {0}".format(
                input_data.frequencies_outfile.name))
        # Only the array-chain recorders support frequency output.
        if "ArrayMarkovChain" in markov_chain.__class__.__name__:
            markov_chain.state_recorder.write_state_frequencies(
                    input_data.frequencies_outfile,
                    input_data.activity_threshold,
                    input_data.transition_ratio,
                    input_data.link_false_pos,
                    input_data.link_false_neg,
                    input_data.link_prior,
                    parameters_state_class,
                    links_state_class
            )
            logger.info("State frequencies written.")
    ending_time = datetime.datetime.now()
    logger.info("Finished.")
    # Report total wall-clock running time as h/m/s, folding days into
    # hours (timedelta.seconds alone wraps at 24 hours).
    running_time = ending_time - starting_time
    hours, remainder = divmod(running_time.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    hours += running_time.days * 24
    logger.info("Running time: {0}h {1}m {2}s".format(hours, minutes,
            seconds))
def calculate_and_output_results_resampling(
        outfileh,
        pairs,
        total_pairs,
        interactions_graph,
        annotations_dict,
        num_permutations,
        use_estimation=True,
        score_correction=False
    ):
    """Compute and write the significance of each annotation-term pair,
    using resampling of genes annotated by the second term.

    Results are buffered and flushed to the output file every
    `RESULTS_BUFFER_SIZE` pairs, with progress logged at each flush.

    :Parameters:
    - `outfileh`: a file handle to a file for output
    - `pairs`: an iterable of pairs of annotation terms
    - `total_pairs`: the number of total annotation pairs to be
      processed
    - `interactions_graph`: graph containing the gene-gene or gene
      product-gene product interactions
    - `annotations_dict`: a dictionary with annotation terms as keys and
      `set`s of genes as values
    - `num_permutations`: maximum number of permutations to perform
      [NOTE: see `use_estimation`]
    - `use_estimation`: estimate significances for pairs which are
      unlikely to have significant scores [default: `True`] [NOTE: using
      this option will not guarantee that the number of permutations
      specified by `num_permutations` will be performed.]
    - `score_correction`: if `True`, perform correction on scores using
      an expected value computed from the mean expression value
      [default: `False`]

    """
    # Output goes through a dictionary-based CSV writer.
    csv_writer = convutils.make_csv_dict_writer(outfileh, OUTFILE_FIELDS)
    # Lazily computes significance statistics, one pair at a time.
    significance_results = compute_significance_for_pairs(
            pairs,
            interactions_graph,
            annotations_dict,
            num_permutations,
            use_estimation,
            score_correction
    )
    buffered_results = []
    for index, pair_result in enumerate(significance_results):
        buffered_results.append(pair_result)
        completed = index + 1
        # Every RESULTS_BUFFER_SIZE pairs, log progress and flush the
        # buffered rows to disk.
        if completed % RESULTS_BUFFER_SIZE == 0:
            fraction_done = int(
                    math.floor(100 * completed / float(total_pairs)))
            logger.info("%d of %d (%d%%) pairs processed. "
                    "Writing to %s."
                    % (completed, total_pairs, fraction_done,
                        outfileh.name)
            )
            write_results_to_csv(csv_writer, buffered_results)
            buffered_results = []
            outfileh.flush()
    logger.info("All %d pairs processed." % total_pairs)
    # Write whatever remains since the last periodic flush.
    if buffered_results:
        logger.info("Writing to %s" % outfileh.name)
        write_results_to_csv(csv_writer, buffered_results)