def process(line_sources):
    """
    @param line_sources: sources of line iterables
    """
    # get the headers and data from all of the input sources
    header_data_pairs = [hud.decode(lines) for lines in line_sources]
    header_list, data_list = zip(*header_data_pairs)
    # get the header to index map for each input source
    h_to_i_list = [Util.inverse_map(x) for x in header_list]
    # get the intersection of headers in all lists
    header_sets = [set(x) for x in header_list]
    header_intersection = set.intersection(*header_sets)
    # get the ordered list of all headers
    unique_headers = list(iterutils.unique_everseen(
        itertools.chain.from_iterable(header_list)))
    # get the ordered list of headers present in every input source
    out_headers = [h for h in unique_headers if h in header_intersection]
    out_data = []
    for h in out_headers:
        row = []
        for data, h_to_i in zip(data_list, h_to_i_list):
            if h in h_to_i:
                row.extend(data[h_to_i[h]])
        out_data.append(row)
    return hud.encode(out_headers, out_data) + '\n'

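# The hud module is used throughout this listing but is not shown.
# Below is a minimal sketch consistent with how decode and encode are
# called here; the exact parsing details are assumptions. A .hud file
# is treated as whitespace-separated rows, each beginning with an OTU
# name followed by integer genotype values.

def _hud_decode_sketch(lines):
    """
    @param lines: raw lines of a .hud file
    @return: (names, data) where data has one integer row per name
    """
    names = []
    data = []
    for line in lines:
        row = line.split()
        if not row:
            continue
        names.append(row[0])
        data.append([int(x) for x in row[1:]])
    return names, data

def _hud_encode_sketch(names, data):
    """
    @param names: ordered row names
    @param data: one row of integer values per name
    @return: the .hud file contents, without a trailing newline
    """
    lines = []
    for name, row in zip(names, data):
        lines.append(' '.join([name] + [str(x) for x in row]))
    return '\n'.join(lines)
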
def process(args, raw_hud_lines):
    """
    @param args: user options from the web or cmdline
    @param raw_hud_lines: raw lines of a .hud file
    @return: results in convenient text form
    """
    out = StringIO()
    names, data = hud.decode(raw_hud_lines)
    # normalize the names of the isolates
    if args.clean_isolates:
        names = [Carbone.clean_isolate_element(x) for x in names]
    # get the pcs
    C_full = np.array(data, dtype=float)
    pcs = eigenpop.get_scaled_eigenvectors(C_full, args.diploid_and_biallelic)
    # check for a sufficient number of eigenvectors
    if len(pcs) < args.npcs:
        msg_a = 'the number of requested principal components '
        msg_b = 'must be no more than the number of OTUs'
        raise ValueError(msg_a + msg_b)
    # create the R frame
    headers = ['otu'] + ['pc%d' % (i + 1) for i in range(args.npcs)]
    print >> out, '\t'.join(headers)
    for i, name in enumerate(names):
        typed_row = [name] + [pcs[j][i] for j in range(args.npcs)]
        if args.add_indices:
            typed_row = [i + 1] + typed_row
        row = [str(x) for x in typed_row]
        print >> out, '\t'.join(row)
    return out.getvalue()

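# eigenpop.get_scaled_eigenvectors is not shown, but do_pca further
# down performs essentially the same computation. The sketch below
# mirrors do_pca; the handling of the diploid_and_biallelic flag
# (halving the column means to get allele frequencies before the
# p(1 - p) normalization, in the style of Patterson et al. 2006) is
# an assumption.

def _get_scaled_eigenvectors_sketch(C_full, diploid_and_biallelic):
    """
    @param C_full: a float count matrix with one row per OTU
    @param diploid_and_biallelic: True when counts are in {0, 1, 2}
    @return: eigenvectors scaled by eigenvalues, largest first
    """
    # remove invariant columns
    C = np.vstack([v for v in C_full.T if len(set(v)) > 1]).T
    u = C.mean(axis=0)
    # assumption: diploid biallelic frequency is half the column mean
    p = u / 2.0 if diploid_and_biallelic else u
    M = (C - u) / np.sqrt(p * (1 - p))
    X = np.dot(M, M.T) / C.shape[1]
    w, v = np.linalg.eigh(X)
    # order by decreasing eigenvalue and scale each eigenvector
    order = np.argsort(w)[::-1]
    return [w[k] * v[:, k] for k in order]
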
def process(hud_lines, matpheno_lines):
    """
    @param hud_lines: lines of a .hud file
    @param matpheno_lines: lines of a MAT_pheno.txt file
    @return: contents of an .ind file
    """
    # get the ordered names from the .hud file
    names, hud_data = hud.decode(hud_lines)
    # get case and control status from the matpheno file
    cases = set()
    controls = set()
    for line in iterutils.stripped_lines(matpheno_lines):
        name, classification = line.split(None, 1)
        if classification == '1':
            cases.add(name)
        elif classification == '2':
            controls.add(name)
        elif classification in ('12', 'null'):
            # skip individuals classified like this
            pass
        else:
            msg = 'invalid MAT_pheno classification: ' + classification
            raise Exception(msg)
    # write the .ind file contents
    out = StringIO()
    for name in names:
        gender = 'U'
        classification = 'Ignore'
        if name in cases:
            classification = 'Case'
        elif name in controls:
            classification = 'Control'
        row = [name, gender, classification]
        print >> out, '\t'.join(row)
    return out.getvalue().rstrip()

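# A worked example with hypothetical inputs. Given a .hud file whose
# ordered names are IC31, IC32, IC33 and MAT_pheno lines such as
#
#   IC31 1
#   IC32 2
#   IC33 null
#
# the function above emits one tab-separated .ind row per .hud name:
#
#   IC31    U    Case
#   IC32    U    Control
#   IC33    U    Ignore
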
def process(args, raw_hud_lines, nseconds=2):
    """
    @param args: user options from the web or cmdline
    @param raw_hud_lines: raw lines of a .hud file
    @param nseconds: a time limit for the selection search
    @return: results in convenient text form
    """
    nwords = args.nwords
    nchars = args.nchars
    names, data = hud.decode(raw_hud_lines)
    out = StringIO()
    if len(data) < nwords:
        msg = 'the number of OTUs is smaller than the desired sample'
        raise HandlingError(msg)
    if len(data[0]) < nchars:
        msg = 'the number of characters is smaller than the desired sample'
        raise HandlingError(msg)
    # create the matrix
    M = np.array(data)
    # select row and column indices
    row_indices, col_indices = get_selections(M, nwords, nchars, nseconds)
    sorted_row_indices = list(sorted(row_indices))
    sorted_col_indices = list(sorted(col_indices))
    # print the separation
    d = get_separation(M, row_indices, col_indices)
    print >> out, 'best separation:', d
    # print the index selections
    print >> out, 'selected row indices:', sorted_row_indices
    print >> out, 'selected column indices:', sorted_col_indices
    # print some selected values
    for i in sorted_row_indices:
        s = ' '.join(str(M[i, j]) for j in sorted_col_indices)
        print >> out, names[i] + '\t' + s
    return out.getvalue().rstrip()

def process(args, raw_hud_lines):
    """
    @param args: user options from the web or cmdline
    @param raw_hud_lines: raw lines of a .hud file
    @return: results in convenient text form
    """
    out = StringIO()
    names, data = hud.decode(raw_hud_lines)
    C_full = np.array(data, dtype=float)
    pcs = eigenpop.get_scaled_eigenvectors(C_full, args.diploid_and_biallelic)
    axis_index = args.axis - 1
    # check for a sufficient number of eigenvectors
    if axis_index >= len(pcs):
        msg = 'the requested axis is not available'
        raise ValueError(msg)
    # compute the correlation of each SNP vector with the requested PC
    pc = pcs[axis_index]
    corrs = [mycorr(snp, pc) for snp in C_full.T]
    sqcorrs = [corr ** 2 for corr in corrs]
    if args.rank_squared:
        keys = sqcorrs
    else:
        keys = corrs
    corr_index_pairs = [(corr, i) for i, corr in enumerate(keys)]
    sorted_pairs = list(reversed(sorted(corr_index_pairs)))
    indices = zip(*sorted_pairs)[1]
    if args.locus_from_1:
        nominal_indices = [i + 1 for i in indices]
    else:
        nominal_indices = indices
    rows = [(nom_i, corrs[i]) for i, nom_i in zip(indices, nominal_indices)]
    lines = ['\t'.join(str(x) for x in row) for row in rows]
    return '\n'.join(lines) + '\n'

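# mycorr is referenced above but not defined in this listing. Below is
# a minimal sketch consistent with its use, namely the Pearson
# correlation of two equal-length vectors; the name matches the call
# sites but the implementation is an assumption.

def _mycorr_sketch(a, b):
    """
    @param a: a sequence of numbers
    @param b: a sequence of numbers of the same length
    @return: the Pearson correlation coefficient of a and b
    """
    a = np.array(a, dtype=float)
    b = np.array(b, dtype=float)
    da = a - a.mean()
    db = b - b.mean()
    denom = np.sqrt(np.dot(da, da) * np.dot(db, db))
    if not denom:
        return 0.0
    return np.dot(da, db) / denom
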
def do_pca(hud_lines):
    """
    @param hud_lines: lines of a .hud file
    @return: names, scaled vectors
    """
    # get the ordered names from the .hud file
    names, data = hud.decode(hud_lines)
    # create the floating point count matrix
    C_full = np.array(data)
    m_full, n_full = C_full.shape
    # remove invariant columns
    C = np.vstack([v for v in C_full.T if len(set(v)) > 1]).T
    # get the shape of the matrix
    m, n = C.shape
    # get the column means
    u = C.mean(axis=0)
    # get the centered and normalized counts matrix
    M = (C - u) / np.sqrt(u * (1 - u))
    # construct the sample covariance matrix
    X = np.dot(M, M.T) / n
    # get the eigendecomposition of the covariance matrix
    evals, evecs = EigUtil.eigh(X)
    # scale the eigenvectors by the eigenvalues
    pcs = [w * v for w, v in zip(evals, evecs)]
    return names, pcs

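# EigUtil.eigh is not shown. From the way its results are used in this
# listing (evals[0] is the principal eigenvalue, evals supports .sum()
# and np.dot, and each element of evecs is a full eigenvector), a
# plausible sketch is a thin wrapper around numpy that sorts the
# eigenpairs in decreasing order; the details are assumptions.

def _eigh_sketch(X):
    """
    @param X: a symmetric matrix
    @return: (evals, evecs) in decreasing eigenvalue order
    """
    w, v = np.linalg.eigh(X)
    order = np.argsort(w)[::-1]
    evals = w[order]
    evecs = [v[:, k] for k in order]
    return evals, evecs
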
def get_response_content(fs):
    headers, data_rows = hud.decode(fs.table.splitlines())
    rtable_header_line = '\t'.join(headers)
    rows = []
    for i, row in enumerate(zip(*data_rows)):
        rows.append([i] + list(row))
    rtable_data_lines = ['\t'.join(str(x) for x in row) for row in rows]
    return '\n'.join([rtable_header_line] + rtable_data_lines) + '\n'

def get_response_content(fs):
    headers, data_rows = hud.decode(fs.table.splitlines())
    data_transpose = zip(*data_rows)
    out = StringIO()
    print >> out, ' '.join(headers)
    for row in data_transpose:
        print >> out, ' '.join(str(x) for x in row)
    return out.getvalue()

def get_response_content(fs):
    headers, data_rows = hud.decode(fs.table.splitlines())
    validate_diploid_data_rows(data_rows)
    nheaders = len(headers)
    D = np.zeros((nheaders, nheaders))
    for i in range(nheaders):
        ri = np.array(data_rows[i])
        for j in range(nheaders):
            rj = np.array(data_rows[j])
            D[i, j] = np.mean(np.abs(rj - ri))
    return '\n'.join('\t'.join(str(x) for x in r) for r in D)

def process(lines):
    """
    @param lines: lines of a .hud file
    """
    names, data = hud.decode(lines)
    out = StringIO()
    for i, genotype in enumerate(data[0]):
        name = 'SNP_%d' % i
        chromosome = '1'
        morgans = '0.0'
        bases = i + 1
        row = [name, chromosome, morgans, bases]
        print >> out, '\t'.join(str(x) for x in row)
    return out.getvalue().rstrip()

def get_response_content(fs):
    # get the headers and data from all of the input sources
    headers, sequences = hud.decode(fs.hud.splitlines())
    h_to_s = dict((h, s) for h, s in zip(headers, sequences))
    headers_out = []
    sequences_out = []
    for p, hs in process_headers(headers):
        headers_out.append(p)
        data = np.vstack([h_to_s[h] for h in hs]).sum(axis=0)
        if fs.combine_exist:
            data = np.minimum(1, data)
        sequences_out.append(data)
    if fs.remove_invariant:
        sequences_out = remove_invariant_columns(sequences_out)
    return hud.encode(headers_out, sequences_out) + '\n'

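# remove_invariant_columns is not defined in this listing. Below is a
# minimal sketch consistent with its use above, dropping every column
# that takes only one value across the given rows; the name matches
# the call site but the implementation is an assumption.

def _remove_invariant_columns_sketch(rows):
    """
    @param rows: a list of equal-length numeric sequences
    @return: the rows with invariant columns removed
    """
    M = np.vstack(rows)
    keep = [j for j in range(M.shape[1]) if len(set(M[:, j])) > 1]
    return [M[i, keep] for i in range(M.shape[0])]
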
def get_response_content(fs):
    out = StringIO()
    # extract names from the .hud file
    names, hud_data = hud.decode(fs.hud.splitlines())
    # read the csv file
    rows = list(csv.reader(Util.get_stripped_lines(fs.info.splitlines())))
    header, data_rows = rows[0], rows[1:]
    cases, controls = get_precipitation_info(data_rows, fs.threshold)
    # write the .ind file contents
    for name in names:
        gender = 'U'
        classification = 'Ignore'
        if name in cases:
            classification = 'Case'
        elif name in controls:
            classification = 'Control'
        row = [name, gender, classification]
        print >> out, '\t'.join(row)
    return out.getvalue()

def get_response_content(fs):
    out = StringIO()
    # extract name order from the .hud file
    names, hud_data = hud.decode(fs.hud.splitlines())
    # read the csv file
    rows = list(csv.reader(Util.get_stripped_lines(fs.info.splitlines())))
    header, data_rows = rows[0], rows[1:]
    cases, controls = get_temperature_info(data_rows, fs.threshold)
    # write the .ind file contents
    for name in names:
        gender = 'U'
        classification = 'Ignore'
        if name in cases:
            classification = 'Case'
        elif name in controls:
            classification = 'Control'
        row = [name, gender, classification]
        print >> out, '\t'.join(row)
    return out.getvalue()

def process(args, raw_hud_lines):
    """
    @param args: user options from the web or cmdline
    @param raw_hud_lines: raw lines of a .hud file
    @return: results in convenient text form
    """
    names, data = hud.decode(raw_hud_lines)
    C_full = np.array(data, dtype=float)
    pcs = eigenpop.get_scaled_eigenvectors(C_full, args.diploid_and_biallelic)
    # check for a sufficient number of eigenvectors
    if len(pcs) < args.ncoords:
        raise ValueError(
            'the number of requested principal components '
            'must be no more than the number of OTUs')
    # compute the correlation of each SNP vector with each requested PC
    mylist = []
    for snp in C_full.T:
        row = [mycorr(snp, pc) for pc in pcs[:args.ncoords]]
        mylist.append(row)
    np.set_printoptions(linewidth=300, threshold=10000)
    return str(np.array(mylist))

def process(hud_lines, info_lines, location):
    """
    @param hud_lines: lines of a .hud file
    @param info_lines: lines of a phenotype .csv file
    @param location: the control location string
    """
    out = StringIO()
    # extract name order from the .hud file
    names, hud_data = hud.decode(hud_lines)
    # read the csv file
    rows = list(csv.reader(info_lines))
    header, data_rows = rows[0], rows[1:]
    cases, controls = get_location_info(data_rows, location)
    # write the .ind file contents
    for name in names:
        gender = 'U'
        classification = 'Ignore'
        if name in cases:
            classification = 'Case'
        elif name in controls:
            classification = 'Control'
        row = [name, gender, classification]
        print >> out, '\t'.join(row)
    return out.getvalue().rstrip()

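# get_location_info (and the analogous get_precipitation_info and
# get_temperature_info above) are not shown. A rough sketch of the
# location variant is below. The csv column positions are assumptions;
# the docstring above only suggests that isolates at the control
# location become controls and all other isolates become cases.

def _get_location_info_sketch(data_rows, control_location):
    """
    @param data_rows: csv data rows, assumed to be (name, location, ...)
    @param control_location: the control location string
    @return: the set of case names and the set of control names
    """
    cases = set()
    controls = set()
    for row in data_rows:
        name, loc = row[0], row[1]
        if loc == control_location:
            controls.add(name)
        else:
            cases.add(name)
    return cases, controls
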
def process(args, hud_lines):
    """
    @param hud_lines: lines of a .hud file
    @return: results in convenient text form
    """
    out = StringIO()
    # get the ordered names from the .hud file
    names, data = hud.decode(hud_lines)
    # create the floating point count matrix
    C_full = np.array(data)
    m_full, n_full = C_full.shape
    # remove invariant columns
    C = np.vstack([v for v in C_full.T if len(set(v)) > 1]).T
    # get the shape of the matrix
    m, n = C.shape
    # get the column means
    u = C.mean(axis=0)
    # get the centered and normalized counts matrix
    M = (C - u) / np.sqrt(u * (1 - u))
    # construct the sample covariance matrix
    X = np.dot(M, M.T) / n
    # get the eigendecomposition of the covariance matrix
    evals, evecs = EigUtil.eigh(X)
    L1 = evals.sum()
    L2 = np.dot(evals, evals)
    proportion = evals[0] / L1
    # compute the relative size of the first eigenvalue
    L = m * proportion
    # compute the Tracy-Widom statistic
    x = get_tracy_widom_statistic(m, n, L)
    # do linkage correction
    n_prime = ((m + 1) * L1 * L1) / ((m - 1) * L2 - L1 * L1)
    # detect additional structure using an alpha level of 0.05
    crit = 0.9794
    if n_prime < n:
        L_prime = (m - 1) * proportion
        x_prime = get_tracy_widom_statistic(m, n_prime, L_prime)
        sigs, insig = get_corrected_structure(crit, evals, m, n_prime)
    else:
        sigs, insig = get_corrected_structure(crit, evals, m, n)
    # print some info
    print >> out, 'number of isolates:'
    print >> out, m_full
    print >> out
    print >> out, 'total number of SNPs:'
    print >> out, n_full
    print >> out
    print >> out, 'number of informative SNPs:'
    print >> out, n
    print >> out
    print >> out, 'effective number of linkage-corrected SNPs:'
    if n_prime < n:
        print >> out, n_prime
    else:
        print >> out, '[sample is too degenerate for estimation]'
    print >> out
    print >> out, 'Tracy-Widom statistic (linkage-naive):'
    print >> out, x
    print >> out
    print >> out, 'Tracy-Widom statistic (linkage-corrected):'
    if n_prime < n:
        print >> out, x_prime
    else:
        print >> out, '[sample is too degenerate for estimation]'
    print >> out
    print >> out, 'proportion of variance explained by principal axis:'
    print >> out, proportion
    print >> out
    print >> out, 'number of significant axes of variation:'
    print >> out, len(sigs)
    print >> out
    print >> out, 'significant Tracy-Widom statistics:'
    for sig in sigs:
        print >> out, sig
    print >> out
    print >> out, 'first insignificant Tracy-Widom statistic:'
    print >> out, insig
    print >> out
    print >> out, 'principal axis projection:'
    for loading, name in sorted(zip(evecs[0] * evals[0], names)):
        print >> out, '\t'.join([name, str(loading)])
    print >> out
    # the eigenvalues should sum to the number of OTUs
    evals_sum = sum(evals)
    if args.sum_to_n:
        print >> out, 'eigenvalues normalized to sum to the number of OTUs:'
        for w in evals:
            print >> out, m_full * w / float(evals_sum)
    elif args.sum_to_1:
        print >> out, 'eigenvalues normalized to sum to 1.0:'
        for w in evals:
            print >> out, w / float(evals_sum)
    return out.getvalue().rstrip()

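# get_tracy_widom_statistic is not shown. Below is a plausible sketch
# following the moment formulas of Patterson, Price and Reich (2006)
# for the mean and standard deviation of the largest eigenvalue of a
# Wishart-like matrix; whether this listing's implementation matches
# exactly is an assumption.

import math

def _tracy_widom_statistic_sketch(m, n, L):
    """
    @param m: the number of samples (rows)
    @param n: the (possibly effective, non-integer) number of markers
    @param L: the scaled largest eigenvalue
    @return: the Tracy-Widom statistic
    """
    a = math.sqrt(n - 1)
    b = math.sqrt(m)
    mu = (a + b) ** 2 / n
    sigma = ((a + b) / n) * (1.0 / a + 1.0 / b) ** (1.0 / 3.0)
    return (L - mu) / sigma
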
def get_response_content(fs):
    headers, data_rows = hud.decode(fs.table.splitlines())
    sequences = [''.join(str(x) for x in row) for row in data_rows]
    return Phylip.encode(headers, sequences)

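# Phylip.encode is not shown. Below is a minimal sketch of a
# sequential, non-interleaved PHYLIP writer; the exact name padding
# and interleaving used by this listing's Phylip module are
# assumptions.

def _phylip_encode_sketch(names, sequences):
    """
    @param names: ordered taxon names
    @param sequences: one character string per taxon
    @return: the PHYLIP alignment as a single string
    """
    lines = ['%d %d' % (len(names), len(sequences[0]))]
    for name, seq in zip(names, sequences):
        # classic PHYLIP pads taxon names to ten characters
        lines.append(name.ljust(10) + seq)
    return '\n'.join(lines)
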
def process(raw_hud_lines):
    names, data = hud.decode(raw_hud_lines)
    columns = zip(*data)
    return '\n'.join(''.join(str(x) for x in c) for c in columns)