# Example #1
# 0
def main():
    """Align GENCODE transcript protein sequences against UniProt records.

    Parses SwissProt and TREMBL flat files (cached via Shove pickles in a
    temp dir), looks up each GENCODE transcript's UniProt entry name through
    a simple_uniprot datasource, records exact sequence matches to a text
    file, and runs the remaining pairwise alignments in parallel, storing
    the results in a Shove database at the output path.
    """
    logger = logging.getLogger(__name__)

    args = parseOptions()
    uniprot_swiss_fname = expanduser(args.swiss_file)
    uniprot_trembl_fname = expanduser(args.trembl_file)
    output_file = args.output_file
    gencode_ds_loc = expanduser(args.gencode_ds)

    out_tx_matches = args.out_tx_matches

    setup_logging()

    uniprot_tsv = expanduser(args.uniprot_tsv)
    blast_exe = args.blast_exe

    gencode_ds = DatasourceFactory.createDatasource(
        configFilename=gencode_ds_loc, leafDir=os.path.dirname(gencode_ds_loc))
    uniprotDS = GenericTranscriptDatasource(src_file=uniprot_tsv,
                                            title="UniProt",
                                            version="2014_12",
                                            geneColumnName="gene")

    tmp_dir = args.temp_pickle_store
    if tmp_dir is None:
        tmp_dir = mkdtemp(prefix="onco_unipickles_")

    # Parsed UniProt records are pickled in tmp_dir so reruns are cheap.
    swiss_data = parseWithShove(uniprot_swiss_fname, parse_uniprot_data,
                                tmp_dir)
    trembl_data = parseWithShove(uniprot_trembl_fname, parse_uniprot_data,
                                 tmp_dir)
    alignmentDB = Shove("file://" + output_file, "simple://")

    # Go through each transcript
    txs = gencode_ds.getTranscriptDict()
    num_tx_ids = len(txs)

    uniprotEntryNameKey = 'UniProt_uniprot_entry_name'

    numNotInProteinSeqs = 0
    numTranscriptsNotInUniprot = 0
    ctr = 0
    process_list = []
    tpool = Pool(processes=4)
    # open() instead of the Py2-only file() builtin; closed in the finally
    # below (the original leaked this handle).
    out_tx_matches_fp = open(out_tx_matches, 'w')
    try:
        for tx_id in txs:
            ctr += 1
            if (ctr % 2000) == 0:
                logger.info(str(ctr) + "/" + str(num_tx_ids))

            tx_protein_seq = txs[tx_id].get_protein_seq()
            # Skip transcripts with no usable protein sequence.
            if tx_protein_seq is None or tx_protein_seq.strip() in ("", "*"):
                numNotInProteinSeqs += 1
                continue

            # Create a fake dummy mutation and annotate the gene and the
            # simple_uniprot info to recover the UniProt entry name.
            m = MutationDataFactory.default_create()
            m.createAnnotation('gene', txs[tx_id].get_gene())
            m.createAnnotation('transcript_id', tx_id)
            m = uniprotDS.annotate_mutation(m)
            uniprot_entry_key = m[uniprotEntryNameKey]

            # Prefer the curated SwissProt record, fall back to TREMBL.
            # Dict membership is O(1); the original tested against .keys()
            # lists, which is O(n) per transcript.
            if uniprot_entry_key in swiss_data:
                uniprot_record = swiss_data[uniprot_entry_key]
            elif uniprot_entry_key in trembl_data:
                uniprot_record = trembl_data[uniprot_entry_key]
            else:
                numTranscriptsNotInUniprot += 1
                continue
            uniprot_seq = uniprot_record.sequence

            # When doing the comparison, tx protein includes stop codon at
            # the end, uniprot does not -- hence the [0:-1].
            if tx_protein_seq[0:-1] == uniprot_seq:
                out_tx_matches_fp.write(tx_id + "\n")

            process_list.append((tx_id, uniprot_entry_key, tx_protein_seq,
                                 uniprot_seq, tmp_dir, blast_exe))
    finally:
        out_tx_matches_fp.close()

    logger.info("Running big block of alignments across multicores (" +
                str(len(process_list)) + " alignments)")
    try:
        alignment_data_tuples = tpool.map(run_alignment_given_tuple,
                                          process_list)
    finally:
        # Release the worker processes (the original never closed the pool).
        tpool.close()
        tpool.join()

    logger.info("Storing results")
    for aligned_tx_id, alignment in alignment_data_tuples:
        alignmentDB[aligned_tx_id] = alignment

    logger.info("Could not get protein seq for " +
                str(numNotInProteinSeqs) + " transcripts.")
    logger.info("Could not get uniprot seq for " +
                str(numTranscriptsNotInUniprot) + " transcripts.")
    logger.info("Attempted " + str(ctr) + " muts")
        # NOTE(review): truncated fragment -- the enclosing parseOptions()
        # def line and the earlier parser setup appear to have been lost
        # from this file; this is the tail of an argument-parser builder.
        # TODO confirm against the original source.
        default="pickles/",
        help="Where to place temporary cached files. Default: %(default)s")
    parser.add_argument(
        "output_file",
        type=str,
        help=
        "TSV filename for output.  File will be overwritten if it already exists."
    )

    # Parse the command line and hand the namespace back to the caller.
    args = parser.parse_args()
    return args


# NOTE(review): truncated script entry point -- the csv.DictWriter call at
# the bottom is cut off mid-argument-list; a near-duplicate of this block
# appears later in the file. TODO reconcile against the original source.
if __name__ == '__main__':

    setup_logging()
    args = parseOptions()
    uniprot_swiss_fname = args.swiss_file
    uniprot_trembl_fname = args.trembl_file
    output_file = args.output_file
    uniprot_tsv = args.uniprot_tsv

    # Normalize the pickle cache location to an absolute dir path.
    pickles_dir = os.path.abspath(os.path.expanduser(args.pickles)) + "/"

    # Go through every record and create an entry for the
    outputHeaders = [
        "gene", "startAA", "endAA", "region", "site", "natural_variation",
        "experimental_info"
    ]
    # NOTE(review): call is incomplete here (keyword args and closing paren
    # missing) -- the rest of this statement was presumably lost.
    tsvWriter = csv.DictWriter(open(output_file, 'w'),
                               outputHeaders,
    '''
    # NOTE(review): truncated fragment -- the ''' above presumably closes an
    # epilog/description triple-quoted string whose opener, along with the
    # enclosing parseOptions() def line, was lost. TODO confirm.
    parser = ArgumentParser(description=desc, formatter_class=RawDescriptionHelpFormatter, epilog=epilog)
    parser.add_argument("swiss_file", type=str, help="SwissProt file. ")
    parser.add_argument("trembl_file", type=str, help="TREMBL file. ")
    parser.add_argument("gencode_ds", type=str, help="GENCODE datasource config file. ")
    parser.add_argument("uniprot_tsv", type=str, help="Uniprot TSV file (used in a simple_uniprot datasource) file. ")
    parser.add_argument("-p", "--pickles", type=str, default="pickles/", help="Where to place temporary cached files. Default: %(default)s")
    parser.add_argument("output_file", type=str, help="TSV filename for output.  File will be overwritten if it already exists.")

    # Parse the command line and hand the namespace back to the caller.
    args = parser.parse_args()
    return args

# Script entry point: parse CLI args, resolve the pickle cache dir, and open
# the output TSV writer. The script body presumably continues past the end
# of this view (the writer is only headed here, never populated or closed).
if __name__ == '__main__':

    setup_logging()
    args = parseOptions()
    uniprot_swiss_fname = args.swiss_file
    uniprot_trembl_fname = args.trembl_file
    output_file = args.output_file
    uniprot_tsv = args.uniprot_tsv

    # Normalize the pickle cache location to an absolute dir path.
    pickles_dir = os.path.abspath(os.path.expanduser(args.pickles)) + "/"

    # Go through every record and create an entry for the
    outputHeaders = ["gene", "startAA", "endAA", "region", "site", "natural_variation","experimental_info"]
    # NOTE(review): the file handle passed to DictWriter is never explicitly
    # closed in the visible code -- relies on interpreter exit. TODO confirm.
    tsvWriter = csv.DictWriter(open(output_file, 'w'), outputHeaders, extrasaction='ignore', delimiter="\t", lineterminator="\n")
    tsvWriter.writeheader()

    # TODO: Reduce code duplication