def process_ppi1(irefindex_file, id_logfile, output_logfile, biochem_file,
                 binary_file, complexes_file, obo_file,
                 biogrid_ptm_codes_file, filtered_pmids_file=None,
                 accepted_taxids=None):
    """Classify every interaction of an iRefIndex MITAB file into output files.

    Each interaction read from ``irefindex_file`` is passed to
    ``_process_interaction``; the integer it returns selects the destination
    the interaction is written to:

    * 0 -- a ``NullFile`` instance (the "removed" sink),
    * 1 -- ``binary_file``,
    * 2 -- ``complexes_file``,
    * 3 -- ``biochem_file``.

    Every destination first receives the ``Interaction`` header.  After the
    whole input has been consumed, the accumulated ``Phase1Counts`` are
    written to ``output_logfile`` (which is also handed to
    ``_process_interaction`` for each record).

    Args:
        irefindex_file: Path of the input MITAB file.
        id_logfile: Path handed to ``read_id_mapping`` to build the ID map.
        output_logfile: Path of the log file to create.
        biochem_file: Path of the output file selected by result 3.
        binary_file: Path of the output file selected by result 1.
        complexes_file: Path of the output file selected by result 2.
        obo_file: Path of the OBO file loaded into ``obo.OBOntology``.
        biogrid_ptm_codes_file: Passed through to ``_BiochemFilter``.
        filtered_pmids_file: Optional path handed to ``read_filtered_pmids``.
        accepted_taxids: Passed through unchanged to ``_process_interaction``.

    All file handles opened here are closed even if processing raises.
    """
    counts = Phase1Counts()
    filtered_pmids = read_filtered_pmids(filtered_pmids_file)
    id_map = read_id_mapping(id_logfile)

    # Every handle is registered as soon as it exists so that the ``finally``
    # clause can release whatever was opened before a failure occurred.
    opened = []

    def _track(fp):
        opened.append(fp)
        return fp

    try:
        # NOTE(review): mode 'rU' is rejected by open() on Python >= 3.11;
        # kept as-is because the target interpreter is not visible here.
        obo_fp = _track(open(obo_file, 'rU'))
        ontology = obo.OBOntology(obo_fp)
        biochem_filter = _BiochemFilter(ontology, biogrid_ptm_codes_file)
        complex_filter = _ComplexFilter(ontology)

        input_fp = _track(open(irefindex_file, 'rU'))
        # presumably discards everything written to it -- confirm in NullFile
        removed_fp = _track(NullFile())
        biochem_fp = _track(open(biochem_file, 'w'))
        binary_fp = _track(open(binary_file, 'w'))
        complex_fp = _track(open(complexes_file, 'w'))
        logfile_fp = _track(open(output_logfile, 'w'))

        # The tuple position is the contract with _process_interaction: its
        # return value is used directly as an index into this tuple.
        output_fps = (removed_fp, binary_fp, complex_fp, biochem_fp)
        for fp in output_fps:
            Interaction.write_header(fp)

        scanner = parse_mitab_file(input_fp, full_mitab_iterator, None,
                                   iRefIndexInteraction)
        for interaction, lines in scanner:
            line_numbers = lines[1]
            res = _process_interaction(interaction, id_map, filtered_pmids,
                                       logfile_fp, counts, line_numbers,
                                       ontology, biochem_filter,
                                       complex_filter, accepted_taxids)
            interaction.to_file(output_fps[res])

        counts.to_file(logfile_fp)
    finally:
        for fp in opened:
            fp.close()
def process_ppi2(input_file, output_file, output_logfile,
                 skipped_pmids_file=None, max_complex_size=120,
                 min_complex_size=3):
    """Process interactions grouped by PMID, deflating pairs into complexes.

    The input is consumed through ``_parse_by_pmid``, which yields
    ``(pmid, pairs, complexes)`` groups.  For each group a
    ``ComplexDeflator`` is applied to the pairs unless the PMID is in the
    skipped set or the group has fewer than ``min_complex_size - 1`` pairs;
    in those cases all pairs are kept as they are and no new complexes are
    produced.  The unused pairs, the existing complexes and the new
    complexes are then written, in that order, to ``output_file``.

    ``Phase2Counts`` statistics are accumulated along the way and written to
    ``output_logfile`` once the input is exhausted.  The same log file
    handle is also given to the ``ComplexDeflator``.

    Args:
        input_file: Path of the input file to read.
        output_file: Path of the output file to create; it starts with the
            ``Interaction`` header.
        output_logfile: Path of the log file to create.
        skipped_pmids_file: Optional path handed to ``read_filtered_pmids``;
            PMIDs from it are never deflated.
        max_complex_size: Passed through to ``ComplexDeflator``.
        min_complex_size: Passed through to ``ComplexDeflator``; also sets
            the minimum number of pairs (``min_complex_size - 1``) a PMID
            needs before deflation is attempted.

    All file handles opened here are closed even if processing raises.
    """
    counts = Phase2Counts()

    # The log file is opened first, as before, so it is created/truncated
    # even when reading the skipped-PMID list fails.
    with open(output_logfile, 'w') as logfile_fp:
        skipped_pmids = read_filtered_pmids(skipped_pmids_file)
        skipped_pmids.add(0)  # this is invalid pmid - not really a paper

        # NOTE(review): mode 'rU' is rejected by open() on Python >= 3.11;
        # kept as-is because the target interpreter is not visible here.
        with open(input_file, 'rU') as input_fp:
            with open(output_file, 'w') as output_fp:
                Interaction.write_header(output_fp)
                deflator = ComplexDeflator(logfile_fp, max_complex_size,
                                           min_complex_size)

                for pmid, pairs, complexes in _parse_by_pmid(input_fp):
                    counts.initial_pairs += len(pairs)
                    counts.C += len(complexes)
                    counts.pmids += 1

                    too_few_pairs = len(pairs) < (min_complex_size - 1)
                    if pmid in skipped_pmids or too_few_pairs:
                        new_complexes = []
                        unused_pairs = pairs
                    else:
                        new_complexes, unused_pairs = deflator(pmid, pairs,
                                                               complexes)

                    _write_unused_pairs(output_fp, unused_pairs)
                    _write_existing_complexes(output_fp, complexes)
                    _write_new_complexes(output_fp, new_complexes)

                    counts.unused_pairs += len(unused_pairs)
                    # Each new complex bumps the counter attribute named
                    # after its edge type.
                    for intr in new_complexes:
                        code = intr.edgetype
                        setattr(counts, code, getattr(counts, code) + 1)

                counts.to_file(logfile_fp)