def insert_best_seqid_column(params):
    """
    Reads a .csv file, maps the IDs found in one of its columns to
    Uniprot metadata, and writes a copy of the .csv with two extra
    columns prepended: 'best_seqid' (the best-mapping Uniprot ID on
    each row) and 'is_reviewed' (its SwissProt review status).

    params keys: 'csv', 'uniprot_ids_header', 'delimiter',
    'output_csv', 'cache_dir'.

    Raises IOError when no file is given, the file is missing, or the
    ID column header is not present in the .csv.
    """
    csv_fname = params['csv']
    id_header = params['uniprot_ids_header']
    delimiter = params['delimiter']
    out_csv = params['output_csv']
    cache_dir = params['cache_dir']

    if not csv_fname:
        raise IOError('No file selected')
    if not os.path.isfile(csv_fname):
        raise IOError(csv_fname + ' not found')

    headers = get_headers(csv_fname)
    if id_header not in headers:
        s = "Column header '%s' not found, available headers:\n" % id_header
        for header in headers:
            s += ' ' + header + '\n'
        raise IOError(s)

    logging('Reading %s\n' % csv_fname)
    entries = read_csv(csv_fname)
    all_seqids = []
    for entry in entries:
        entry['seqids'] = [
            token.strip()
            for token in entry[id_header].split(delimiter)]
        all_seqids.extend(entry['seqids'])
    logging('Found %d potential Uniprot IDs\n' % len(all_seqids))

    uniprot_data = uniprot.batch_uniprot_metadata(all_seqids, cache_dir)
    for entry in entries:
        ranked = uniprot.sort_seqids_by_uniprot(entry['seqids'], uniprot_data)
        best_seqid = ranked[0]
        entry['best_seqid'] = best_seqid
        if best_seqid in uniprot_data:
            entry['is_reviewed'] = uniprot_data[best_seqid]['is_reviewed']
        else:
            entry['is_reviewed'] = False

    logging('Writing ')
    logging('%s\n' % os.path.abspath(out_csv), lambda: open_file(out_csv))
    out_headers = ['best_seqid', 'is_reviewed'] + get_headers(csv_fname)
    rows = [out_headers]
    for entry in entries:
        rows.append([entry[h] for h in out_headers])
    write_csv(out_csv, rows)
def add_uniprot_data(protxml_groups, cache_file=None): """ Processes the data from an PROTXML file, reads the seqids, and attempts to mapt to a UNIPROT ID and then, fetch the metadata of that protein from the uniprot.org website: organism, gene, description etc. """ seqids = get_all_seqids(protxml_groups) if is_url_connected('http://uniprot.org'): uniprot_dict = uniprot.get_metadata_with_some_seqid_conversions( seqids, cache_file) else: print "Can't connect to www.uniprot.org, won't use uniprot metatdata" uniprot_dict = {} for group_id, protxml_group in protxml_groups.items(): for protein in protxml_group['proteins']: protein['id'] = '' protein['acc'] = protein['protein_name'] names = [protein['protein_name']] + protein['other_seqids'] new_seqid = uniprot.sort_seqids_by_uniprot(names, uniprot_dict)[0] if new_seqid != protein['protein_name']: print "Protein group %s%s is better represented with %s than %s" % \ (group_id, protein['group_sibling_id'], uniprot.get_naked_seqid(new_seqid), uniprot.get_naked_seqid(protein['protein_name'])) protein['protein_name'] = new_seqid protein['other_seqids'] = names[1:] protein['acc'] = new_seqid protein['other_seqids'] = ';'.join(protein['other_seqids']) if new_seqid not in uniprot_dict: print "No uniprot metadata for protein group %s%s seqid %s" % \ (group_id, protein['group_sibling_id'], uniprot.get_naked_seqid(new_seqid)) continue protein['link'] = '' uniprot_entry = uniprot_dict[new_seqid] protein['id'] = uniprot_entry['id'] protein['acc'] = uniprot_entry['accs'][0] protein['link'] = \ '=HYPERLINK("http://uniprot.org/uniprot/%s")' % \ uniprot_dict[new_seqid]['id'] if 'gene' in uniprot_entry: protein['gene'] = uniprot_entry['gene'] if 'organism' in uniprot_entry: protein['organism'] = uniprot_entry['organism'] protein['description'] = '; '.join(uniprot_entry['descriptions']) if 'length' in uniprot_entry: protein['length'] = uniprot_entry['length']
def add_uniprot_data(protxml_groups, cache_file=None): """ Processes the data from an PROTXML file, reads the seqids, and attempts to mapt to a UNIPROT ID and then, fetch the metadata of that protein from the uniprot.org website: organism, gene, description etc. """ seqids = get_all_seqids(protxml_groups) if is_url_connected('http://uniprot.org'): uniprot_dict = uniprot.get_metadata_with_some_seqid_conversions(seqids, cache_file) else: print "Can't connect to www.uniprot.org, won't use uniprot metatdata" uniprot_dict = {} for group_id, protxml_group in protxml_groups.items(): for protein in protxml_group['proteins']: protein['id'] = '' protein['acc'] = protein['protein_name'] names = [protein['protein_name']] + protein['other_seqids'] new_seqid = uniprot.sort_seqids_by_uniprot(names, uniprot_dict)[0] if new_seqid != protein['protein_name']: print "Protein group %s%s is better represented with %s than %s" % \ (group_id, protein['group_sibling_id'], uniprot.get_naked_seqid(new_seqid), uniprot.get_naked_seqid(protein['protein_name'])) protein['protein_name'] = new_seqid protein['other_seqids'] = names[1:] protein['acc'] = new_seqid protein['other_seqids'] = ';'.join(protein['other_seqids']) if new_seqid not in uniprot_dict: print "No uniprot metadata for protein group %s%s seqid %s" % \ (group_id, protein['group_sibling_id'], uniprot.get_naked_seqid(new_seqid)) continue protein['link'] = '' uniprot_entry = uniprot_dict[new_seqid] protein['id'] = uniprot_entry['id'] protein['acc'] = uniprot_entry['accs'][0] protein['link'] = \ '=HYPERLINK("http://uniprot.org/uniprot/%s")' % \ uniprot_dict[new_seqid]['id'] if 'gene' in uniprot_entry: protein['gene'] = uniprot_entry['gene'] if 'organism' in uniprot_entry: protein['organism'] = uniprot_entry['organism'] protein['description'] = '; '.join(uniprot_entry['descriptions']) if 'length' in uniprot_entry: protein['length'] = uniprot_entry['length']
def insert_best_seqid_column(params):
    """
    Adds two columns, 'best_seqid' and 'is_reviewed', to the front of a
    .csv file by looking up Uniprot metadata for the IDs listed in one
    of its columns, and writes the result to params['output_csv'].

    Expected params keys: 'csv', 'uniprot_ids_header', 'delimiter',
    'output_csv', 'cache_dir'.

    Raises IOError for a missing/unselected input file or an unknown
    ID column header.
    """
    fname = params['csv']
    if not fname:
        raise IOError('No file selected')
    if not os.path.isfile(fname):
        raise IOError(fname + ' not found')

    uniprot_ids_header = params['uniprot_ids_header']
    available = get_headers(fname)
    if uniprot_ids_header not in available:
        msg = "Column header '%s' not found, available headers:\n" % uniprot_ids_header
        for h in available:
            msg += ' ' + h + '\n'
        raise IOError(msg)

    logging('Reading %s\n' % fname)
    entries = read_csv(fname)
    delimiter = params['delimiter']
    all_seqids = []
    for entry in entries:
        seqids = []
        for piece in entry[uniprot_ids_header].split(delimiter):
            seqids.append(piece.strip())
        entry['seqids'] = seqids
        all_seqids += seqids
    logging('Found %d potential Uniprot IDs\n' % len(all_seqids))

    uniprot_data = uniprot.batch_uniprot_metadata(
        all_seqids, params['cache_dir'])
    for entry in entries:
        best = uniprot.sort_seqids_by_uniprot(entry['seqids'], uniprot_data)[0]
        entry['best_seqid'] = best
        if best in uniprot_data:
            entry['is_reviewed'] = uniprot_data[best]['is_reviewed']
        else:
            entry['is_reviewed'] = False

    out_csv = params['output_csv']
    logging('Writing ')
    logging('%s\n' % os.path.abspath(out_csv), lambda: open_file(out_csv))
    headers = ['best_seqid', 'is_reviewed'] + get_headers(fname)
    rows = [headers] + [[entry[h] for h in headers] for entry in entries]
    write_csv(out_csv, rows)