def main():
    args = parseOptions()
    uniprot_swiss_fname = expanduser(args.swiss_file)
    uniprot_trembl_fname = expanduser(args.trembl_file)
    output_file = args.output_file
    gencode_ds_loc = expanduser(args.gencode_ds)
    out_tx_matches = args.out_tx_matches
    out_tx_matches_fp = open(out_tx_matches, 'w')
    setup_logging()
    uniprot_tsv = expanduser(args.uniprot_tsv)
    blast_exe = args.blast_exe

    gencode_ds = DatasourceFactory.createDatasource(configFilename=gencode_ds_loc,
                                                    leafDir=os.path.dirname(gencode_ds_loc))
    # uniprotDS = DatasourceFactory.createDatasource(configFilename=uniprot_ds_loc, leafDir=os.path.dirname(uniprot_ds_loc))
    uniprotDS = GenericTranscriptDatasource(src_file=uniprot_tsv, title="UniProt",
                                            version="2014_12", geneColumnName="gene")

    tmp_dir = args.temp_pickle_store
    if tmp_dir is None:
        tmp_dir = mkdtemp(prefix="onco_unipickles_")

    swiss_data = parseWithShove(uniprot_swiss_fname, parse_uniprot_data, tmp_dir)
    trembl_data = parseWithShove(uniprot_trembl_fname, parse_uniprot_data, tmp_dir)

    alignmentDB = Shove("file://" + output_file, "simple://")

    # Go through each transcript.
    txs = gencode_ds.getTranscriptDict()
    tx_ids = txs.keys()
    num_tx_ids = len(tx_ids)

    swissKeys = swiss_data.keys()
    tremblKeys = trembl_data.keys()
    uniprotEntryNameKey = 'UniProt_uniprot_entry_name'

    numNotInProteinSeqs = 0
    numTranscriptsNotInUniprot = 0
    ctr = 0
    process_list = []
    tpool = Pool(processes=4)

    for tx_id in tx_ids:
        ctr += 1
        if (ctr % 2000) == 0:
            logging.getLogger(__name__).info(str(ctr) + "/" + str(num_tx_ids))

        tx_protein_seq = txs[tx_id].get_protein_seq()
        if tx_protein_seq is None or tx_protein_seq.strip() == "" or tx_protein_seq.strip() == "*":
            numNotInProteinSeqs += 1
            continue

        # Create a dummy mutation and annotate it with the gene and the simple_uniprot info.
        m = MutationDataFactory.default_create()
        m.createAnnotation('gene', txs[tx_id].get_gene())
        m.createAnnotation('transcript_id', tx_id)
        m = uniprotDS.annotate_mutation(m)

        uniprot_entry_key = m[uniprotEntryNameKey]
        if uniprot_entry_key in swissKeys:
            uniprot_record = swiss_data[uniprot_entry_key]
        elif uniprot_entry_key in tremblKeys:
            uniprot_record = trembl_data[uniprot_entry_key]
        else:
            numTranscriptsNotInUniprot += 1
            continue
        uniprot_seq = uniprot_record.sequence

        # "/bulk/blast-2.2.26/bin/bl2seq" is blast_exe for Lee's laptop VM.
        # When doing the comparison, the transcript protein sequence includes the
        # stop codon at the end; the UniProt sequence does not.
        if tx_protein_seq[0:-1] == uniprot_seq:
            out_tx_matches_fp.write(tx_id + "\n")

        # runAlignment(tx_id, uniprot_entry_key, tx_protein_seq, uniprot_seq, tmp_dir, blast_exe, alignmentDB)
        p = (tx_id, uniprot_entry_key, tx_protein_seq, uniprot_seq, tmp_dir, blast_exe)
        process_list.append(p)
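    # Each tuple in process_list carries everything a worker needs (the two
    # protein sequences, the temp directory, and the blast executable path), so
    # the Pool workers below can run independently and hand back
    # (tx_id, alignment_data) pairs for the storage loop.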
    # Run all of the alignments.
    logging.getLogger(__name__).info("Running big block of alignments across multiple cores ("
                                     + str(len(process_list)) + " alignments)")
    alignment_data_tuples = tpool.map(run_alignment_given_tuple, process_list)

    # Single-core alternative:
    # logging.getLogger(__name__).info("Running big block of alignments across one core (" + str(len(process_list)) + " alignments)")
    # alignment_data_tuples = [run_alignment_given_tuple(p) for p in process_list]

    logging.getLogger(__name__).info("Storing results")
    for t in alignment_data_tuples:
        alignmentDB[t[0]] = t[1]

    logging.getLogger(__name__).info("Could not get protein seq for " + str(numNotInProteinSeqs) + " transcripts.")
    logging.getLogger(__name__).info("Could not get uniprot seq for " + str(numTranscriptsNotInUniprot) + " transcripts.")
    logging.getLogger(__name__).info("Attempted " + str(ctr) + " transcripts")
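# A minimal sketch of run_alignment_given_tuple's expected shape (the real
# implementation lives elsewhere in this module), inferred from the pool.map call
# and the alignmentDB[t[0]] = t[1] storage loop above. Unlike the commented-out
# single-process runAlignment call, a pooled worker cannot write into alignmentDB
# directly, so it must return its result:
#
#     def run_alignment_given_tuple(t):
#         tx_id, uniprot_entry_key, tx_protein_seq, uniprot_seq, tmp_dir, blast_exe = t
#         alignment_data = ...  # run bl2seq via blast_exe on the two sequences and parse
#         return tx_id, alignment_data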
default="pickles/", help="Where to place temporary cached files. Default: %(default)s") parser.add_argument( "output_file", type=str, help= "TSV filename for output. File will be overwritten if it already exists." ) args = parser.parse_args() return args if __name__ == '__main__': setup_logging() args = parseOptions() uniprot_swiss_fname = args.swiss_file uniprot_trembl_fname = args.trembl_file output_file = args.output_file uniprot_tsv = args.uniprot_tsv pickles_dir = os.path.abspath(os.path.expanduser(args.pickles)) + "/" # Go through every record and create an entry for the outputHeaders = [ "gene", "startAA", "endAA", "region", "site", "natural_variation", "experimental_info" ] tsvWriter = csv.DictWriter(open(output_file, 'w'), outputHeaders,
    '''
    parser = ArgumentParser(description=desc, formatter_class=RawDescriptionHelpFormatter, epilog=epilog)
    parser.add_argument("swiss_file", type=str, help="SwissProt file.")
    parser.add_argument("trembl_file", type=str, help="TrEMBL file.")
    parser.add_argument("gencode_ds", type=str, help="GENCODE datasource config file.")
    parser.add_argument("uniprot_tsv", type=str,
                        help="UniProt TSV file (used in a simple_uniprot datasource).")
    parser.add_argument("-p", "--pickles", type=str, default="pickles/",
                        help="Where to place temporary cached files. Default: %(default)s")
    parser.add_argument("output_file", type=str,
                        help="TSV filename for output. File will be overwritten if it already exists.")
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    setup_logging()
    args = parseOptions()
    uniprot_swiss_fname = args.swiss_file
    uniprot_trembl_fname = args.trembl_file
    output_file = args.output_file
    uniprot_tsv = args.uniprot_tsv
    pickles_dir = os.path.abspath(os.path.expanduser(args.pickles)) + "/"

    # Go through every record and create an entry for the output.
    outputHeaders = ["gene", "startAA", "endAA", "region", "site",
                     "natural_variation", "experimental_info"]
    tsvWriter = csv.DictWriter(open(output_file, 'w'), outputHeaders,
                               extrasaction='ignore', delimiter="\t", lineterminator="\n")
    tsvWriter.writeheader()

    # TODO: Reduce code duplication
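# Note: extrasaction='ignore' tells csv.DictWriter to silently drop any keys in a
# row dict that are not listed in outputHeaders, so records parsed from the
# UniProt files can carry extra fields without raising a ValueError.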