import gzip
from collections import Counter
from subprocess import PIPE, Popen

import numpy as np
import pandas as pd
from scipy import sparse


def get_sequences():
    # Collect valid sequences from the CAFA3 targets and training data.
    data = list()
    with open('data/cafa3/targets.txt', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            if is_ok(items[1]):
                data.append(items[1])
    with open('data/cafa3/data.txt', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            if is_ok(items[1]):
                data.append(items[1])
    return data
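# `is_ok` is used throughout this module but defined elsewhere. A minimal
# sketch, assuming it simply rejects empty sequences and sequences containing
# anything other than the 20 standard amino acids; the original helper's
# exact validity rules may differ:
AALETTER = set('ACDEFGHIKLMNPQRSTVWY')


def is_ok(seq):
    # Hypothetical reimplementation for illustration only.
    return bool(seq) and all(aa in AALETTER for aa in seq)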
def get_functions(annot_num):
    # Count, for each GO term (with ancestor propagation), how many proteins
    # carry an experimental annotation for it, then keep the terms seen at
    # least annot_num times.
    df = pd.read_pickle(DATA_ROOT + 'swissprot_exp.pkl')
    annots = dict()
    for i, row in df.iterrows():
        if not is_ok(row['sequences']):
            continue
        go_set = set()
        for annot in row['annots']:
            go_id, code = annot.split('|')[:2]
            if code not in EXP_CODES:
                continue
            if go_id in func_set:
                go_set |= get_anchestors(go, go_id)
        for go_id in go_set:
            if go_id not in annots:
                annots[go_id] = 0
            annots[go_id] += 1
    filtered = list()
    for go_id in functions:
        if go_id in annots and annots[go_id] >= annot_num:
            filtered.append(go_id)
    print(len(filtered))
    df = pd.DataFrame({'functions': filtered})
    df.to_pickle(DATA_ROOT + FUNCTION + '.pkl')
    print('Saved ' + DATA_ROOT + FUNCTION + '.pkl')
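# `get_anchestors` (sic) comes from the project's utils. A minimal sketch,
# assuming `go` is a dict mapping a GO id to a record whose 'is_a' field
# lists parent ids; it returns the term together with all of its ancestors
# in the ontology:
from collections import deque


def get_anchestors(go, go_id):
    # Breadth-first traversal up the is_a hierarchy.
    go_set = set()
    q = deque([go_id])
    while q:
        g_id = q.popleft()
        if g_id not in go_set:
            go_set.add(g_id)
            for parent_id in go[g_id]['is_a']:
                if parent_id in go:
                    q.append(parent_id)
    return go_set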
def load_data():
    ngram_df = pd.read_pickle(DATA_ROOT + 'ngrams.pkl')
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))
    proteins = list()
    gos = list()
    labels = list()
    ngrams = list()
    sequences = list()
    accessions = list()
    df = pd.read_pickle(DATA_ROOT + 'swissprot_exp.pkl')
    # Filter out rows with invalid sequences
    index = list()
    for i, row in df.iterrows():
        if is_ok(row['sequences']):
            index.append(i)
    df = df.loc[index]
    for i, row in df.iterrows():
        # Keep only experimentally supported annotations
        go_list = []
        for item in row['annots']:
            items = item.split('|')
            if items[1] in EXP_CODES:
                go_list.append(items[0])
        # Propagate annotations to their ancestors in the GO hierarchy
        go_set = set()
        for go_id in go_list:
            if go_id in func_set:
                go_set |= get_anchestors(go, go_id)
        if not go_set or GO_ID not in go_set:
            continue
        go_set.remove(GO_ID)
        gos.append(go_list)
        proteins.append(row['proteins'])
        accessions.append(row['accessions'])
        seq = row['sequences']
        sequences.append(seq)
        # Encode the sequence as overlapping n-gram indexes
        grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
        for j in range(len(seq) - gram_len + 1):
            grams[j] = vocab[seq[j:(j + gram_len)]]
        ngrams.append(grams)
        # Binary label vector over the selected GO functions
        label = np.zeros((len(functions), ), dtype='int32')
        for go_id in go_set:
            if go_id in go_indexes:
                label[go_indexes[go_id]] = 1
        labels.append(label)
    res_df = pd.DataFrame({
        'accessions': accessions,
        'proteins': proteins,
        'ngrams': ngrams,
        'labels': labels,
        'gos': gos,
        'sequences': sequences})
    print(len(res_df))
    return res_df
def to_pandas():
    ngram_df = pd.read_pickle('data/ngrams.pkl')
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))
    proteins = list()
    accessions = list()
    sequences = list()
    interpros = list()
    ngrams = list()
    indexes = list()
    counter = Counter()
    maxlen = 0
    with open('data/data.tsv') as f:
        for line in f:
            items = line.strip().split('\t')
            seq = items[2]
            if not is_ok(seq) or len(seq) > 1600:
                continue
            proteins.append(items[0])
            accessions.append(items[1].split(';')[0])
            maxlen = max(maxlen, len(seq))
            sequences.append(seq)
            grams = list()
            for i in range(len(seq) - gram_len + 1):
                grams.append(vocab[seq[i:(i + gram_len)]])
            index = np.array([AAINDEX[x] for x in seq])
            indexes.append(index)
            ngrams.append(np.array(grams))
            # Everything after the sequence column is an InterPro id
            interpros.append(items[3:])
            for item in items[3:]:
                counter[item] += 1
    print('Maximum sequence length:', maxlen)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'ngrams': ngrams,
        'interpros': interpros,
        'indexes': indexes})
    print(df)
    df.to_pickle('data/data.pkl')
    # Keep only InterPro ids that occur at least 100 times
    dictionary = list()
    for ipro, cnt in counter.items():
        if cnt >= 100:
            dictionary.append(ipro)
    dict_df = pd.DataFrame({'interpros': dictionary})
    print(dict_df)
    dict_df.to_pickle('data/dictionary.pkl')
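# `AAINDEX` maps each amino-acid letter to an integer index. A minimal
# sketch, assuming indexes start at 1 so that 0 can serve as padding; the
# ordering in the original constant may differ:
AAINDEX = {aa: i + 1 for i, aa in enumerate('ACDEFGHIKLMNPQRSTVWY')}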
def get_data():
    proteins = list()
    targets = list()
    ngrams = list()
    ngram_df = pd.read_pickle('data/eshark/ngrams.pkl')
    vocab = {}
    mapping = get_blast_mapping()
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))
    with open('data/eshark/targets.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            seq = it[1]
            if is_ok(seq):
                targets.append(it[0])
                # Map the target id to a UniProt accession via BLAST if known
                if it[0] in mapping:
                    proteins.append(mapping[it[0]])
                else:
                    proteins.append('')
                grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
                for i in range(len(seq) - gram_len + 1):
                    grams[i] = vocab[seq[i:(i + gram_len)]]
                ngrams.append(grams)
    df = pd.DataFrame({
        'targets': targets,
        'accessions': proteins,
        'ngrams': ngrams})
    print(len(df))
    embed_df = pd.read_pickle('data/graph_new_embeddings.pkl')
    df = pd.merge(df, embed_df, on='accessions', how='left')
    # Assigning to the row returned by iterrows() does not modify the frame,
    # so write missing embeddings back with df.at instead.
    missing_rep = 0
    for i, row in df.iterrows():
        if not isinstance(row['embeddings'], np.ndarray):
            df.at[i, 'embeddings'] = np.zeros((256, ), dtype='float32')
            missing_rep += 1
    print('Missing embeddings:', missing_rep)
    df.to_pickle('data/eshark/targets.pkl')
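# `get_blast_mapping` is defined elsewhere. A minimal sketch, assuming it
# parses two-column (qseqid, sseqid) BLAST tabular output into a dict of
# target id to best-hit accession; the filename here is a hypothetical
# placeholder, not the project's actual path:
def get_blast_mapping(filename='data/eshark/blast_mapping.tab'):
    mapping = {}
    with open(filename) as f:
        for line in f:
            it = line.strip().split('\t')
            if it[0] not in mapping:
                # Keep only the first (best) hit per query
                mapping[it[0]] = it[1]
    return mapping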
def filter_exp():
    # Keep only proteins with at least one experimentally supported GO
    # annotation and a valid sequence.
    df = pd.read_pickle(DATA_ROOT + 'swissprot.pkl')
    exp_codes = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'IC'}
    index = list()
    for i, row in df.iterrows():
        ok = False
        for go_id in row['annots']:
            code = go_id.split('|')[1]
            if code in exp_codes:
                ok = True
                break
        if ok and is_ok(row['sequences']):
            index.append(i)
    df = df.loc[index]
    print(len(df))
    df.to_pickle(DATA_ROOT + 'swissprot_exp.pkl')
def cafa3():
    root = 'data/cafa3/CAFA3_training_data/'
    filename = root + 'uniprot_sprot_exp.fasta'
    data = read_fasta(filename)
    # Collect the set of GO annotations for each protein
    annots = dict()
    with open(root + 'uniprot_sprot_exp.txt') as f:
        for line in f:
            items = line.strip().split('\t')
            if items[0] not in annots:
                annots[items[0]] = set()
            annots[items[0]].add(items[1])
    # Write a tab file of annotated proteins with '; '-separated GO ids
    with open(root + 'uniprot_sprot.tab', 'w') as fl:
        for line in data:
            items = line.split('\t')
            if is_ok(items[1]) and items[0] in annots:
                fl.write(line + '\t')
                gos = list(annots[items[0]])
                fl.write(gos[0])
                for go_id in gos[1:]:
                    fl.write('; ' + go_id)
                fl.write('\n')
def load_data():
    ngram_df = pd.read_pickle('data/ngrams.pkl')
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))
    ngrams = list()
    proteins = list()
    with open('data/swissprot.fasta') as f:
        prots, seqs = read_fasta(f.readlines())
    for protein, seq in zip(prots, seqs):
        if not is_ok(seq) or len(seq) - gram_len + 1 > MAXLEN:
            continue
        proteins.append(protein)
        grams = list()
        for i in range(len(seq) - gram_len + 1):
            grams.append(vocab[seq[i:(i + gram_len)]])
        ngrams.append(grams)
    df = pd.DataFrame({
        'proteins': proteins,
        'ngrams': ngrams,
    })

    def get_values(df):
        # Pack the variable-length n-gram lists into a sparse matrix
        # padded to MAXLEN columns.
        grows = []
        gcols = []
        gdata = []
        for i, row in enumerate(df.itertuples()):
            for j in range(len(row.ngrams)):
                grows.append(i)
                gcols.append(j)
                gdata.append(row.ngrams[j])
        data = sparse.csr_matrix(
            (gdata, (grows, gcols)), shape=(len(df), MAXLEN))
        return data

    return proteins, get_values(df)
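# `read_fasta` is defined elsewhere. A minimal sketch of the variant used
# just above, assuming it takes FASTA lines and returns parallel lists of
# ids and sequences (the call in cafa3() appears to use a different variant
# that yields tab-joined records):
def read_fasta(lines):
    prots, seqs = [], []
    seq = ''
    for line in lines:
        line = line.strip()
        if line.startswith('>'):
            # Flush the previous record before starting a new one
            if seq:
                seqs.append(seq)
            prots.append(line[1:].split()[0])
            seq = ''
        else:
            seq += line
    if seq:
        seqs.append(seq)
    return prots, seqs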
def load_data(split=0.9):
    df = pd.read_pickle('data/sw_scores.pkl')
    ngrams = list()
    keep = list()
    prot_index = {}
    for row in df.itertuples():
        seq = row.sequences
        if not is_ok(seq) or len(seq) > MAXLEN:
            continue
        ngrams.append([AAINDEX[x] for x in seq])
        prot_index[row.proteins] = len(prot_index)
        keep.append(row.Index)
    # Drop the filtered rows first so the new column lines up with the frame
    df = df.loc[keep]
    df['ngrams'] = ngrams
    n = len(df)
    index = np.arange(n)
    np.random.seed(seed=0)
    np.random.shuffle(index)
    train_n = int(n * split)
    valid_n = int(train_n * split)
    train_df = df.iloc[index[:valid_n]]
    valid_df = df.iloc[index[valid_n:train_n]]
    test_df = df.iloc[index[train_n:]]

    def get_values(df):
        # One-hot encode each sequence into a (MAXLEN, 21) matrix and
        # reorder every score vector by the kept-protein indexes.
        index = np.zeros((len(df), ), dtype=np.int32)
        data = np.zeros((len(df), MAXLEN, 21), dtype=np.float32)
        for i, row in enumerate(df.itertuples()):
            for j in range(len(row.ngrams)):
                data[i, j, row.ngrams[j]] = 1
            index[i] = prot_index[row.proteins]
        scores = df['scores'].values
        for i in range(len(scores)):
            scores[i] = scores[i][index]
        return data, scores

    train, valid, test = (get_values(train_df), get_values(valid_df),
                          get_values(test_df))
    return train, valid, test
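# Example of consuming the splits above (assumes MAXLEN is set and the
# sw_scores pickle exists); each split is a (one-hot sequences, scores) pair:
#
#     train, valid, test = load_data(split=0.9)
#     x_train, y_train = train
#     print(x_train.shape)  # (n_train, MAXLEN, 21)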
def to_pickle_org(org='mouse'):
    proteins = list()
    accessions = list()
    sequences = list()
    length = list()
    status = list()
    ngrams = list()
    ngram_df = pd.read_pickle(DATA_ROOT + 'ngrams.pkl')
    vocab = {}
    for key, gram in enumerate(ngram_df['ngrams']):
        vocab[gram] = key + 1
    gram_len = len(ngram_df['ngrams'][0])
    print('Gram length:', gram_len)
    print('Vocabulary size:', len(vocab))
    # Open in text mode so lines are str, not bytes
    with gzip.open(DATA_ROOT + 'uniprot-' + org + '.tab.gz', 'rt') as f:
        next(f)  # skip the header line
        for line in f:
            items = line.strip().split('\t')
            seq = items[2]
            if not is_ok(seq):
                continue
            proteins.append(items[1])
            accessions.append(items[0])
            sequences.append(seq)
            length.append(int(items[3]))
            status.append(items[4])
            grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
            for i in range(len(seq) - gram_len + 1):
                grams[i] = vocab[seq[i:(i + gram_len)]]
            ngrams.append(grams)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'length': length,
        'status': status,
        'ngrams': ngrams})
    print(len(df))
    df.to_pickle(DATA_ROOT + org + '-sequences.pkl')
    # Keep only reviewed (Swiss-Prot) entries
    df = df[df['status'] == 'reviewed']
    print(len(df))
    print('Loading embeddings')
    rep_df = pd.read_pickle('data/graph_new_embeddings.pkl')
    embeds = {}
    for i, row in rep_df.iterrows():
        embeds[row['accessions']] = row['embeddings']
    df = pd.merge(df, rep_df, on='accessions', how='left')
    # For proteins without an embedding, BLAST them against the proteins
    # that have one and borrow the best hit's embedding.
    p = Popen([
        'blastp', '-db', 'data/embeddings.fa', '-max_target_seqs', '1',
        '-num_threads', '128', '-outfmt', '6 qseqid sseqid'],
        stdin=PIPE, stdout=PIPE, universal_newlines=True)
    missing_rep = 0
    for i, row in df.iterrows():
        if not isinstance(row['embeddings'], np.ndarray):
            p.stdin.write(
                '>' + row['accessions'] + '\n' + row['sequences'] + '\n')
            missing_rep += 1
    print('Starting blastp for %d' % missing_rep)
    p.stdin.close()
    embed_map = {}
    if p.wait() == 0:
        for line in p.stdout:
            print(line)
            it = line.strip().split('\t')
            embed_map[it[0]] = it[1]
    missing_rep = 0
    for i, row in df.iterrows():
        if not isinstance(row['embeddings'], np.ndarray):
            if row['accessions'] in embed_map:
                df.at[i, 'embeddings'] = embeds[embed_map[row['accessions']]]
            else:
                df.at[i, 'embeddings'] = np.zeros((256, ), dtype=np.float32)
                missing_rep += 1
    print('Missing reps:', missing_rep)
    df.to_pickle(DATA_ROOT + org + '-data.pkl')
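# Hypothetical entry point showing how the preparation steps chain together;
# the annot_num value is a placeholder, and the actual scripts are likely
# invoked separately:
#
#     if __name__ == '__main__':
#         filter_exp()
#         get_functions(annot_num=50)
#         df = load_data()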