def _test_stdin_input(self, input_file):
    """
    Builds one table directly from input_file and a second one by passing
    "-" as the input with the file attached as stdin, then asserts that
    the two tables are equal.
    """
    from_file = os.path.join(self._homedir, "original")
    from_stdin = os.path.join(self._homedir, "stdin")
    # Reference build straight from the path on disk.
    self.run_command([input_file, from_file, "-q"])
    # Second build: same bytes, but delivered through stdin.
    with open(input_file, "rb") as source:
        self.run_command(["-", from_stdin], stdin=source)
    with wt.open_table(from_file) as expected, \
            wt.open_table(from_stdin) as actual:
        self.assert_tables_equal(expected, actual)
    # Leave an empty home directory behind for whatever runs next.
    shutil.rmtree(self._homedir)
    os.mkdir(self._homedir)
def _test_gzipped_input(self, input_file):
    """
    Builds one table from input_file and a second one from a gzipped copy
    of the same file, then asserts that the two tables are equal.

    The home directory is emptied and recreated at the end.
    """
    original = os.path.join(self._homedir, "original")
    zipped = os.path.join(self._homedir, "zipped")
    self.run_command([input_file, original, "-q"])
    zgtf = os.path.join(self._homedir, "gtf.gz")
    # Both handles are managed by "with" so neither leaks if the copy fails;
    # copyfileobj streams the data instead of loading the whole file at once.
    with open(input_file, "rb") as f:
        with gzip.open(zgtf, "wb") as z:
            shutil.copyfileobj(f, z)
    self.run_command([zgtf, zipped, "-qf"])
    with wt.open_table(original) as t1:
        with wt.open_table(zipped) as t2:
            self.assert_tables_equal(t1, t2)
    shutil.rmtree(self._homedir)
    os.mkdir(self._homedir)
def hq_snps_bygt(homedir, sample, gt, minq, cols):
    """
    Print, tab separated, the columns cols of the rows found through the
    "<sample>.GT+QUAL[1]" index between the keys (gt, minq) and
    (gt, largest QUAL key recorded for gt + 1).

    The table and the index are opened with context managers so both are
    closed when printing finishes or fails.
    """
    index_name = "{0}.GT+QUAL[1]".format(sample)
    with wt.open_table(homedir) as t, t.open_index(index_name) as i:
        start = (gt, minq)
        # presumably the cursor's stop key is exclusive, hence the + 1 past
        # the largest QUAL key for this genotype -- confirm against wormtable
        stop = (gt, i.max_key(gt)[1] + 1)
        for row in i.cursor(cols, start=start, stop=stop):
            # "value" rather than "i": the loop variable must not shadow
            # (or, on Python 2, rebind) the index object.
            print("\t".join([str(value) for value in row]))
def load_wt_file(wt_file, individuals=None):
    """
    loads wormtable file

    Opens the table at wt_file and returns a tuple (cursor, individuals),
    where the cursor reads the "<individual>.GT" column of every entry in
    individuals. When individuals is omitted a fresh empty list is used
    and returned.

    The table is deliberately left open: the returned cursor reads from it.
    """
    # A None default avoids sharing one list object between calls; the
    # list is handed back to the caller, who may well mutate it.
    if individuals is None:
        individuals = []
    genotypes = [ind + '.GT' for ind in individuals]
    table = wormtable.open_table(wt_file)
    tc = table.cursor(genotypes)
    return tc, individuals
def main():
    """
    Parse the command line, open the table in the given home directory
    together with its CHROM+POS index, and print every row produced by
    snp_filter as one tab separated line.
    """
    parser = argparse.ArgumentParser(description=globals()['__doc__'])
    # (name, keyword options) in registration order; the order fixes the
    # order of the positional arguments on the command line.
    cli_spec = [
        ('cols', dict(
            # NOTE: argparse does not apply "default" to a required
            # positional, so this value is effectively informational.
            default="CHROM,POS",
            help='comma separated column names to print')),
        ('-i', dict(
            default='i', choices=['i', 'e', 'f'],
            help='indel mode: i=include, e=exclude, f=find [default=i]')),
        ('-f', dict(
            help='specify semicolon separated filters as '
                 'COLUMN(>=|<=|>|<|==|!=)VALUE,'
                 ' e.g. "QUAL>20;SAMPLE.GT==0/0"')),
        ('-r', dict(
            help='region, e.g. 1:300-500 (start and end inclusive)')),
        ('homedir', dict(help='home directory of database')),
    ]
    for name, options in cli_spec:
        parser.add_argument(name, **options)
    # snp_filter receives the parsed arguments as a plain dict.
    args = vars(parser.parse_args())
    with wt.open_table(args['homedir']) as table:
        with table.open_index("CHROM+POS") as index:
            for row in snp_filter(table, index, args):
                print('\t'.join(map(str, row)))
def retrieve_variants_by_rowid(inp_folder, ids, out_file): """ Use the row IDs in ids to query the complete wormtable (containing all variant fields) and return all the information about the filtered variants. """ # open table and load indices table = wt.open_table(inp_folder + '/schema.wt', db_cache_size='4G') index = table.open_index('row_id') # retrieve the rows using the 'row_id' field and write the results in out_file col_names = [col.get_name() for col in table.columns()] row_id_idx = col_names.index('row_id') out = open(out_file, 'w') out.write('\t'.join(col_names) + '\n') for row in index.cursor(col_names): if row[row_id_idx] in ids: to_write = list() for value in row: try: # value is a number (int or float) to_write.append(int(value)) except TypeError, e: # value is a tuple if value is not None: to_write.append(','.join([str(x) for x in value])) else: to_write.append(None) except ValueError, e: # value is a string to_write.append(value) except:
def get_variants_assoc_to_gene_set(inp_folder, genes, field_name,
                                   negative_query):
    """
    Open the field_name wormtable (assumed to be named
    'inp_folder/field_name.wt') within inp_folder and return a set of row
    IDs selected by gene name.

    A row matches when any entry of genes occurs as a substring of any of
    the comma separated values stored in field_name.

    negative_query is compared against the strings 'True' and 'False':
    'False' returns the IDs of the matching rows, 'True' returns the IDs
    of all other rows. Any other value returns None.
    """
    all_ids = set()
    pos_ids = set()
    # The context manager closes the table even if reading a row fails.
    with wt.open_table(inp_folder + '/' + field_name + '.wt',
                       db_cache_size='4G') as table:
        for row_id, field_value in table.cursor(['row_id', field_name]):
            all_ids.add(row_id)
            # any() stops at the first hit instead of scanning the
            # remaining values once the row is known to match.
            if any(gene in value
                   for value in field_value.split(',')
                   for gene in genes):
                pos_ids.add(row_id)
    # if "negative_query" is 'True', return all row IDs not in "pos_ids"
    if negative_query == 'True':
        return all_ids - pos_ids
    elif negative_query == 'False':
        return pos_ids
def get_total_variant_count(out_folder):
    """
    Get the total (initial) number of variants, i.e. the length of the
    'schema.wt' table inside out_folder.
    """
    # Close the table once its length has been read instead of leaking it.
    with wt.open_table(os.path.join(out_folder, 'schema.wt')) as tbl:
        return len(tbl)
def load_wt_file(wt_file, individuals=None):
    """
    loads wormtable file

    Returns (cursor, individuals): a cursor over the "<individual>.GT"
    column of each requested individual, plus the list of individuals
    itself (a new empty list when the argument is omitted).

    The table stays open on purpose, because the returned cursor is
    backed by it.
    """
    # Never use a list literal as the default: it would be shared by all
    # calls and is returned to the caller.
    if individuals is None:
        individuals = []
    genotypes = [ind + '.GT' for ind in individuals]
    table = wormtable.open_table(wt_file)
    tc = table.cursor(genotypes)
    return tc, individuals
def __init__(self, variantSetId, wtDir):
    """
    Allocates a new WormtableDataset with the specified variantSetId
    based on the specified wormtable directory.

    Opens the table plus its CHROM+POS and CHROM+ID indexes and builds
    two lookup structures from the columns after FILTER_COL:
    _infoCols, a list of (info field name, column) tuples, and
    _sampleCols, a dict mapping each sample name to a list of
    (column, field name) tuples.
    """
    self._variantSetId = variantSetId
    self._wtDir = wtDir
    table = wt.open_table(wtDir)
    self._table = table
    self._chromPosIndex = table.open_index("CHROM+POS")
    self._chromIdIndex = table.open_index("CHROM+ID")
    self._sampleCols = {}
    self._infoCols = []
    self._firstSamplePosition = -1
    for column in table.columns()[self.FILTER_COL + 1:]:
        fullName = column.get_name()
        parts = fullName.split(".")
        if fullName.startswith("INFO"):
            self._infoCols.append((parts[1], column))
            continue
        if fullName.endswith(".GT"):
            # The .GT column is assumed to come first for each sample,
            # so it creates the sample's entry.
            self._sampleCols[parts[0]] = [(column, "GT")]
            if self._firstSamplePosition == -1:
                self._firstSamplePosition = column.get_position()
            continue
        # Any other column is treated as sample specific and is added to
        # the entry its .GT column created.
        self._sampleCols[parts[0]].append((column, parts[1]))
def filter_variants(inp_folder, genotype, samples_list):
    """
    Open the wormtable of every entry in samples_list and collect the
    row_id of each variant whose genotype belongs to the requested class.

    genotype is one of 'homref' (all alleles '0'), 'homalt' (all alleles
    identical and not '0') or 'het' (alleles differ). Genotypes of any
    ploidy are supported. Genotypes that are missing, empty or start with
    '.' (e.g. './.') are skipped.

    inp_folder is not used: every entry of samples_list is passed to
    wt.open_table as is. Rows are expected to hold row_id in column 0 and
    the genotype string in column 1.

    Returns a dict mapping each entry of samples_list to its set of
    row IDs.
    """
    # samples_ids has sample names as keys and id sets as values
    samples_ids = dict()
    row_id_idx = 0
    sample_idx = 1
    for sample in samples_list:
        ids = set()
        # the context manager closes the table even if a row fails to parse
        with wt.open_table(sample, db_cache_size='4G') as table:
            for row in table:
                raw = row[sample_idx]
                if not raw:
                    # missing genotype: nothing to classify
                    continue
                gen = raw.replace('/', '').replace('|', '')
                # an empty allele string would make gen[0] raise IndexError
                if not gen or gen[0] == '.':
                    continue
                # gen == len(gen)*gen[0] holds exactly when all the
                # characters of the string are the same
                if gen != len(gen) * gen[0]:
                    call = 'het'
                elif gen[0] == '0':
                    call = 'homref'
                else:
                    call = 'homalt'
                if call == genotype:
                    ids.add(row[row_id_idx])
        samples_ids[sample] = ids
    return samples_ids
def filter_variants(inp_folder, genotype, samples_list):
    """
    Open the wormtable of every entry in samples_list and collect the
    row_id of each variant whose genotype belongs to the requested class.

    genotype is one of 'homref' (all alleles '0'), 'homalt' (all alleles
    identical and not '0') or 'het' (alleles differ). Genotypes of any
    ploidy are supported. Genotypes that are missing, empty or start with
    '.' (e.g. './.') are skipped.

    inp_folder is not used: every entry of samples_list is passed to
    wt.open_table as is. Rows are expected to hold row_id in column 0 and
    the genotype string in column 1.

    Returns a dict mapping each entry of samples_list to its set of
    row IDs.
    """
    # samples_ids has sample names as keys and id sets as values
    samples_ids = dict()
    row_id_idx = 0
    sample_idx = 1
    for sample in samples_list:
        ids = set()
        # the context manager closes the table even if a row fails to parse
        with wt.open_table(sample, db_cache_size='4G') as table:
            for row in table:
                raw = row[sample_idx]
                if not raw:
                    # missing genotype: nothing to classify
                    continue
                gen = raw.replace('/', '').replace('|', '')
                # an empty allele string would make gen[0] raise IndexError
                if not gen or gen[0] == '.':
                    continue
                # gen == len(gen)*gen[0] holds exactly when all the
                # characters of the string are the same
                if gen != len(gen) * gen[0]:
                    call = 'het'
                elif gen[0] == '0':
                    call = 'homref'
                else:
                    call = 'homalt'
                if call == genotype:
                    ids.add(row[row_id_idx])
        samples_ids[sample] = ids
    return samples_ids
def get_variants_of_given_type_from_previous_results(inp_folder, var_type,
                                                     previous_results):
    """
    Open the REF+ALT wormtable (assumed to be named
    'inp_folder/REF+ALT.wt') within inp_folder and return the set of row
    IDs of the variants of type var_type, restricted to the row IDs
    listed in previous_results.

    var_type is 'SNPs' (REF and some ALT both of length 1), 'InDels'
    (some ALT differs in length from REF) or 'MNPs' (REF longer than 1
    and some ALT of the same length). Any other value writes a message
    to stderr and calls sys.exit().

    previous_results is the path of a tab-separated file with a 1-line
    header whose left-most field is the integer row_id.
    """
    # one predicate per variant type, applied to (REF, single ALT allele)
    matchers = {
        'SNPs': lambda ref, alt: len(ref) == 1 and len(alt) == 1,
        'InDels': lambda ref, alt: len(ref) != len(alt),
        'MNPs': lambda ref, alt: len(ref) > 1 and len(ref) == len(alt),
    }
    matches = matchers.get(var_type)
    if matches is None:
        # Reject a bad type before any file or table is opened.
        sys.stderr.write("\nVariant type not properly defined.\n")
        # NOTE(review): exits with status 0 although this is an error --
        # kept as is, confirm whether callers rely on it.
        sys.exit()

    with open(previous_results) as f:
        next(f, None)  # skip the header line
        ids_to_check = set(int(line.split('\t')[0]) for line in f)

    ids = set()
    # "with" closes the index before the table, also when an error occurs
    with wt.open_table(inp_folder + '/REF+ALT.wt',
                       db_cache_size='4G') as table, \
            table.open_index('row_id') as index:
        for row_id, ref, alt_field in index.cursor(['row_id', 'REF', 'ALT']):
            if row_id not in ids_to_check:
                continue
            # ALT may list several comma separated alleles; one matching
            # allele is enough
            if any(matches(ref, alt) for alt in alt_field.split(',')):
                ids.add(row_id)
    return ids
def filter_variants_from_previous_results(inp_folder, genotype, samples_list,
                                          previous_results):
    """
    Open the wormtable of every entry in samples_list and collect the
    row_id of each variant whose genotype belongs to the requested class,
    looking only at the row IDs listed in previous_results.

    genotype is one of 'homref' (all alleles '0'), 'homalt' (all alleles
    identical and not '0') or 'het' (alleles differ). Genotypes of any
    ploidy are supported. Genotypes that are missing, empty or start with
    '.' (e.g. './.') are skipped.

    inp_folder is not used: every entry of samples_list is passed to
    wt.open_table as is, and the genotype column is named after the
    entry's base name with '.wt' removed.

    previous_results is the path of a tab-separated file with a 1-line
    header whose left-most field is the integer row_id.

    Returns a dict mapping each entry of samples_list to its set of
    row IDs.
    """
    with open(previous_results) as f:
        next(f, None)  # skip the header line
        ids_to_check = set(int(line.split('\t')[0]) for line in f)

    # samples_ids has sample names as keys and id sets as values
    samples_ids = dict()
    for sample in samples_list:
        ids = set()
        gt_column = os.path.basename(sample).replace('.wt', '')
        # "with" closes the index before the table, also on errors
        with wt.open_table(sample, db_cache_size='4G') as table, \
                table.open_index('row_id') as index:
            for row_id, raw in index.cursor(['row_id', gt_column]):
                # only analyse row if row_id is among the ones to check
                if row_id not in ids_to_check or not raw:
                    continue
                gen = raw.replace('/', '').replace('|', '')
                # an empty allele string would make gen[0] raise IndexError
                if not gen or gen[0] == '.':
                    continue
                # gen == len(gen)*gen[0] holds exactly when all the
                # characters of the string are the same
                if gen != len(gen) * gen[0]:
                    call = 'het'
                elif gen[0] == '0':
                    call = 'homref'
                else:
                    call = 'homalt'
                if call == genotype:
                    ids.add(row_id)
        samples_ids[sample] = ids
    return samples_ids
def filter_variants_from_previous_results(inp_folder, genotype, samples_list,
                                          previous_results):
    """
    Open the wormtable of every entry in samples_list and collect the
    row_id of each variant whose genotype belongs to the requested class,
    looking only at the row IDs listed in previous_results.

    genotype is one of 'homref' (all alleles '0'), 'homalt' (all alleles
    identical and not '0') or 'het' (alleles differ). Genotypes of any
    ploidy are supported. Genotypes that are missing, empty or start with
    '.' (e.g. './.') are skipped.

    inp_folder is not used: every entry of samples_list is passed to
    wt.open_table as is, and the genotype column is named after the
    entry's base name with '.wt' removed.

    previous_results is the path of a tab-separated file with a 1-line
    header whose left-most field is the integer row_id.

    Returns a dict mapping each entry of samples_list to its set of
    row IDs.
    """
    with open(previous_results) as f:
        next(f, None)  # skip the header line
        ids_to_check = set(int(line.split('\t')[0]) for line in f)

    # samples_ids has sample names as keys and id sets as values
    samples_ids = dict()
    for sample in samples_list:
        ids = set()
        gt_column = os.path.basename(sample).replace('.wt', '')
        # "with" closes the index before the table, also on errors
        with wt.open_table(sample, db_cache_size='4G') as table, \
                table.open_index('row_id') as index:
            for row_id, raw in index.cursor(['row_id', gt_column]):
                # only analyse row if row_id is among the ones to check
                if row_id not in ids_to_check or not raw:
                    continue
                gen = raw.replace('/', '').replace('|', '')
                # an empty allele string would make gen[0] raise IndexError
                if not gen or gen[0] == '.':
                    continue
                # gen == len(gen)*gen[0] holds exactly when all the
                # characters of the string are the same
                if gen != len(gen) * gen[0]:
                    call = 'het'
                elif gen[0] == '0':
                    call = 'homref'
                else:
                    call = 'homalt'
                if call == genotype:
                    ids.add(row_id)
        samples_ids[sample] = ids
    return samples_ids
def setUp(self):
    """
    Opens every table found in the fixture's data directory together with
    its CHROM and CHROM+POS indexes, keyed by directory entry name, and
    creates the WormtableBackend under test on the same directory.
    """
    global _wormtableTestFixture
    dataDir = _wormtableTestFixture.dataDir
    self._dataDir = dataDir
    self._tables = {}
    self._chromIndexes = {}
    self._chromPosIndexes = {}
    # (registry, index name) pairs, opened in this order for each table
    registries = (
        (self._chromIndexes, "CHROM"),
        (self._chromPosIndexes, "CHROM+POS"),
    )
    for entry in os.listdir(dataDir):
        opened = wt.open_table(os.path.join(dataDir, entry))
        self._tables[entry] = opened
        for registry, indexName in registries:
            registry[entry] = opened.open_index(indexName)
    self._backend = server.WormtableBackend(dataDir)
def setUp(self):
    """
    Opens every table found in the fixture's data directory together with
    its CHROM and CHROM+POS indexes, keyed by directory entry name, and
    creates the Backend under test for WormtableVariantSet on the same
    directory.
    """
    global _wormtableTestFixture
    dataDir = _wormtableTestFixture.dataDir
    self._dataDir = dataDir
    self._tables = {}
    self._chromIndexes = {}
    self._chromPosIndexes = {}
    # (registry, index name) pairs, opened in this order for each table
    registries = (
        (self._chromIndexes, "CHROM"),
        (self._chromPosIndexes, "CHROM+POS"),
    )
    for entryName in os.listdir(dataDir):
        opened = wt.open_table(os.path.join(dataDir, entryName))
        self._tables[entryName] = opened
        for registry, indexName in registries:
            registry[entryName] = opened.open_index(indexName)
    self._backend = backend.Backend(dataDir, variants.WormtableVariantSet)
def count_Ts_Tv_wtcursor(homedir):
    """
    Count number of transitions and transversions by reading the REF and
    ALT columns row by row with a plain table cursor (no index is opened).

    Rows where REF equals ALT, or where either value is not a key of the
    module-level bases mapping, are ignored. A substitution counts as a
    transition when bases maps REF and ALT to the same value, otherwise
    as a transversion. Returns the tuple (Ts, Tv).
    """
    transitions = 0
    transversions = 0
    with wt.open_table(homedir) as table:
        for ref, alt in table.cursor(["REF", "ALT"]):
            if ref == alt or ref not in bases or alt not in bases:
                continue
            if bases[ref] == bases[alt]:
                transitions += 1
            else:
                transversions += 1
    return transitions, transversions
def count_Ts_Tv_wtindex(homedir):
    """
    Count number of transitions and transversions using wormtable and
    the counter of an index on REF+ALT.

    Every ordered pair of distinct keys of the module-level bases mapping
    is looked up in the counter; the count goes to transitions when bases
    maps both members to the same value, otherwise to transversions.
    Returns the tuple (Ts, Tv).
    """
    transitions = 0
    transversions = 0
    with wt.open_table(homedir) as table:
        with table.open_index("REF+ALT") as index:
            counts = index.counter()
            for pair in permutations(bases.keys(), 2):
                ref, alt = pair
                if bases[ref] == bases[alt]:
                    transitions += counts[pair]
                else:
                    transversions += counts[pair]
    return transitions, transversions
def test_open_api(self):
    """
    Tests the open_table/index api to ensure everything works correctly.

    Checks is_open() after each open and close of a table and of an index,
    both with explicit open/close calls and with the context-manager form.
    """
    # Create a minimal table (id column plus one uint column) by hand.
    t = wt.Table(self._homedir)
    t.add_id_column()
    t.add_uint_column("u1")
    t.open("w")
    self.assertTrue(t.is_open())
    t.close()
    self.assertFalse(t.is_open())
    # open_table returns a table that is already open
    t = wt.open_table(self._homedir)
    self.assertTrue(t.is_open())
    t.close()
    self.assertFalse(t.is_open())
    # try now with the context manager: open inside, closed afterwards
    with wt.open_table(self._homedir) as t:
        self.assertTrue(t.is_open())
    self.assertFalse(t.is_open())
    # Now do the same for an index. Build one on column 1 first.
    t = wt.open_table(self._homedir)
    name = "test"
    i = wt.Index(t, name)
    i.add_key_column(t.get_column(1))
    i.open("w")
    i.build()
    i.close()
    # The index is built, so we can open it.
    i = t.open_index(name)
    self.assertTrue(i.is_open())
    i.close()
    self.assertFalse(i.is_open())
    # Context-manager form for the index.
    with t.open_index(name) as i:
        self.assertTrue(i.is_open())
    self.assertFalse(i.is_open())
    # The index is closed by now; close the table last.
    t.close()
def get_variants_assoc_to_gene_set_from_previous_results(
        inp_folder, genes, field_name, negative_query, previous_results):
    """
    Open the field_name wormtable (assumed to be named
    'inp_folder/field_name.wt') within inp_folder and return a set of row
    IDs selected by gene name, looking only at the row IDs listed in
    previous_results.

    A row matches when any entry of genes occurs as a substring of any of
    the comma separated values stored in field_name.

    negative_query is compared against the strings 'True' and 'False':
    'False' returns the IDs of the matching rows, 'True' returns the IDs
    of the other checked rows. Any other value returns None.

    previous_results is the path of a tab-separated file with a 1-line
    header whose left-most field is the integer row_id.
    """
    with open(previous_results) as f:
        next(f, None)  # skip the header line
        ids_to_check = set(int(line.split('\t')[0]) for line in f)

    all_ids = set()
    pos_ids = set()
    # "with" closes the index before the table, also when an error occurs
    with wt.open_table(inp_folder + '/' + field_name + '.wt',
                       db_cache_size='4G') as table, \
            table.open_index('row_id') as index:
        for row_id, field_value in index.cursor(['row_id', field_name]):
            if row_id not in ids_to_check:
                continue
            all_ids.add(row_id)
            # any() stops at the first hit instead of scanning the
            # remaining values once the row is known to match.
            if any(gene in value
                   for value in field_value.split(',')
                   for gene in genes):
                pos_ids.add(row_id)
    # if "negative_query" is 'True', return all row IDs not in "pos_ids"
    if negative_query == 'True':
        return all_ids - pos_ids
    elif negative_query == 'False':
        return pos_ids
def get_variants_assoc_to_gene_set_from_previous_results(
        inp_folder, genes, field_name, negative_query, previous_results):
    """
    Open the field_name wormtable (assumed to be named
    'inp_folder/field_name.wt') within inp_folder and return a set of row
    IDs selected by gene name, looking only at the row IDs listed in
    previous_results.

    A row matches when any entry of genes occurs as a substring of any of
    the comma separated values stored in field_name.

    negative_query is compared against the strings 'True' and 'False':
    'False' returns the IDs of the matching rows, 'True' returns the IDs
    of the other checked rows. Any other value returns None.

    previous_results is the path of a tab-separated file with a 1-line
    header whose left-most field is the integer row_id.
    """
    with open(previous_results) as f:
        next(f, None)  # skip the header line
        ids_to_check = set(int(line.split('\t')[0]) for line in f)

    all_ids = set()
    pos_ids = set()
    # "with" closes the index before the table, also when an error occurs
    with wt.open_table(inp_folder + '/' + field_name + '.wt',
                       db_cache_size='4G') as table, \
            table.open_index('row_id') as index:
        for row_id, field_value in index.cursor(['row_id', field_name]):
            if row_id not in ids_to_check:
                continue
            all_ids.add(row_id)
            # any() stops at the first hit instead of scanning the
            # remaining values once the row is known to match.
            if any(gene in value
                   for value in field_value.split(',')
                   for gene in genes):
                pos_ids.add(row_id)
    # if "negative_query" is 'True', return all row IDs not in "pos_ids"
    if negative_query == 'True':
        return all_ids - pos_ids
    elif negative_query == 'False':
        return pos_ids
def setUp(self):
    """
    Builds a table from the test VCF, opens it as self._table, and counts
    two things directly from the VCF text for later comparison:
    self._info_cols (lines starting with "##INFO") and self._num_rows
    (lines not starting with "#").
    """
    super(VcfBuildTest, self).setUp()
    vcf = self.get_vcf()
    self.run_command([vcf, self._homedir, "-qf"])
    self._table = wt.open_table(self._homedir)
    # get some simple information about the VCF
    self._num_rows = 0
    self._info_cols = 0
    # "with" makes sure the VCF handle is closed once it has been scanned
    with open(vcf, "r") as f:
        for l in f:
            if l.startswith("#"):
                if l.startswith("##INFO"):
                    self._info_cols += 1
            else:
                self._num_rows += 1
def count_Ts_Tv(homedir):
    """
    Count number of transitions and transversions using the counter of
    an index on REF+ALT.

    Every ordered pair of distinct bases among A, C, G and T is looked up
    in the counter. Pairs within the same class (purine or pyrimidine)
    count as transitions, all others as transversions. Returns the tuple
    (Ts, Tv).
    """
    bases = {b'A': 'purine', b'G': 'purine',
             b'C': 'pyrimidine', b'T': 'pyrimidine'}
    Ts, Tv = 0, 0
    # "with" closes the index and then the table, also when a lookup fails
    with wt.open_table(homedir) as t, t.open_index("REF+ALT") as i:
        c = i.counter()
        # permutations() is iterated directly; no intermediate list needed
        for s in permutations([b'A', b'C', b'G', b'T'], 2):
            if bases[s[0]] == bases[s[1]]:
                Ts += c[s]
            else:
                Tv += c[s]
    return Ts, Tv
def setUp(self):
    """
    Builds a table from the test GTF, opens it as self._table, and parses
    the GTF text independently into self._rows, a list with one dict per
    line keyed by column name, with self._num_rows holding the line count.

    "." in the score or frame field becomes None, and str values are
    encoded to bytes.
    """
    super(GtfBuildTest, self).setUp()
    gtf = self.get_gtf()
    self.run_command([gtf, self._homedir, "-qf"])
    self._table = wt.open_table(self._homedir)
    self._columns = [
        SEQNAME, SOURCE, FEATURE, START, END, SCORE, STRAND, FRAME,
        GENE_ID, TRANSCRIPT_ID
    ]
    # parse the file
    self._num_rows = 0
    self._rows = []
    # "with" makes sure the GTF handle is closed once it has been parsed
    with open(gtf, "r") as f:
        for l in f:
            row = {}
            tokens = l.split("\t")
            row[SEQNAME] = tokens[0]
            row[SOURCE] = tokens[1]
            row[FEATURE] = tokens[2]
            row[START] = int(tokens[3])
            row[END] = int(tokens[4])
            tok = tokens[5]
            row[SCORE] = float(tok) if tok != "." else None
            row[STRAND] = tokens[6]
            tok = tokens[7]
            row[FRAME] = int(tok) if tok != "." else None
            # the last field holds ";" separated attributes, each split
            # on whitespace into a key and a double-quoted value
            d = {}
            for s in tokens[8].split(";"):
                spl = s.split()
                if len(spl) > 0:
                    d[spl[0]] = spl[1].strip("\"")
            row[GENE_ID] = d[GENE_ID]
            row[TRANSCRIPT_ID] = d[TRANSCRIPT_ID]
            # if the type is str, encode it.
            r = {}
            for k, v in row.items():
                r[k] = v.encode() if isinstance(v, str) else v
            self._rows.append(r)
            self._num_rows += 1
def setUp(self):
    """
    Builds a table from the test GTF, opens it as self._table, and parses
    the GTF text independently into self._rows, a list with one dict per
    line keyed by column name, with self._num_rows holding the line count.

    "." in the score or frame field becomes None, and str values are
    encoded to bytes.
    """
    super(GtfBuildTest, self).setUp()
    gtf = self.get_gtf()
    self.run_command([gtf, self._homedir, "-qf"])
    self._table = wt.open_table(self._homedir)
    self._columns = [SEQNAME, SOURCE, FEATURE, START, END, SCORE, STRAND,
                     FRAME, GENE_ID, TRANSCRIPT_ID]
    # parse the file
    self._num_rows = 0
    self._rows = []
    # "with" makes sure the GTF handle is closed once it has been parsed
    with open(gtf, "r") as f:
        for l in f:
            row = {}
            tokens = l.split("\t")
            row[SEQNAME] = tokens[0]
            row[SOURCE] = tokens[1]
            row[FEATURE] = tokens[2]
            row[START] = int(tokens[3])
            row[END] = int(tokens[4])
            tok = tokens[5]
            row[SCORE] = float(tok) if tok != "." else None
            row[STRAND] = tokens[6]
            tok = tokens[7]
            row[FRAME] = int(tok) if tok != "." else None
            # the last field holds ";" separated attributes, each split
            # on whitespace into a key and a double-quoted value
            d = {}
            for s in tokens[8].split(";"):
                spl = s.split()
                if len(spl) > 0:
                    d[spl[0]] = spl[1].strip("\"")
            row[GENE_ID] = d[GENE_ID]
            row[TRANSCRIPT_ID] = d[TRANSCRIPT_ID]
            # if the type is str, encode it.
            r = {}
            for k, v in row.items():
                r[k] = v.encode() if isinstance(v, str) else v
            self._rows.append(r)
            self._num_rows += 1
def get_variants_in_given_regions(inp_folder, chrom, start_pos, end_pos):
    """
    Open the CHROM+POS wormtable (assumed to be named
    'inp_folder/CHROM+POS.wt') within inp_folder and return the set of
    row IDs found through the 'CHROM+POS' index between the keys
    (chrom, start_pos) and (chrom, end_pos).

    NOTE(review): presumably the cursor's stop key is exclusive, so a
    variant located exactly at end_pos may be left out -- confirm against
    the wormtable cursor documentation.
    """
    ids = set()
    # "with" closes the index before the table, also when an error occurs
    with wt.open_table(inp_folder + '/CHROM+POS.wt',
                       db_cache_size='4G') as table, \
            table.open_index('CHROM+POS') as index:
        # row_id is the first of the requested columns
        cols = ['row_id', 'CHROM', 'POS']
        for row in index.cursor(cols, start=(chrom, start_pos),
                                stop=(chrom, end_pos)):
            ids.add(row[0])
    return ids
def get_variants_of_given_type(inp_folder, var_type):
    """
    Open the REF+ALT wormtable (assumed to be named
    'inp_folder/REF+ALT.wt') within inp_folder and return the set of row
    IDs of the variants of type var_type.

    var_type is 'SNPs' (REF and some ALT both of length 1), 'InDels'
    (some ALT differs in length from REF) or 'MNPs' (REF longer than 1
    and some ALT of the same length). Any other value writes a message
    to stderr and calls sys.exit().
    """
    # one predicate per variant type, applied to (REF, single ALT allele)
    matchers = {
        'SNPs': lambda ref, alt: len(ref) == 1 and len(alt) == 1,
        'InDels': lambda ref, alt: len(ref) != len(alt),
        'MNPs': lambda ref, alt: len(ref) > 1 and len(ref) == len(alt),
    }
    matches = matchers.get(var_type)
    if matches is None:
        # Reject a bad type before the table is opened, so nothing is
        # left open when the process exits.
        sys.stderr.write("\nVariant type not properly defined.\n")
        # NOTE(review): exits with status 0 although this is an error --
        # kept as is, confirm whether callers rely on it.
        sys.exit()

    ids = set()
    # the context manager closes the table even if reading a row fails
    with wt.open_table(inp_folder + '/REF+ALT.wt',
                       db_cache_size='4G') as table:
        for row_id, ref, alt_field in table.cursor(['row_id', 'REF', 'ALT']):
            # ALT may list several comma separated alleles; one matching
            # allele is enough
            if any(matches(ref, alt) for alt in alt_field.split(',')):
                ids.add(row_id)
    return ids
def get_variants_in_given_regions_from_previous_results(
        inp_folder, chrom, start_pos, end_pos, previous_results):
    """
    Open the CHROM+POS wormtable (assumed to be named
    'inp_folder/CHROM+POS.wt') within inp_folder and return the set of
    row IDs found through the 'CHROM+POS' index between the keys
    (chrom, start_pos) and (chrom, end_pos), keeping only the row IDs
    listed in previous_results.

    previous_results is the path of a tab-separated file with a 1-line
    header whose left-most field is the integer row_id.

    NOTE(review): presumably the cursor's stop key is exclusive, so a
    variant located exactly at end_pos may be left out -- confirm against
    the wormtable cursor documentation.
    """
    with open(previous_results) as f:
        next(f, None)  # skip the header line
        ids_to_check = set(int(line.split('\t')[0]) for line in f)

    ids = set()
    # "with" closes the index before the table, also when an error occurs
    with wt.open_table(inp_folder + '/CHROM+POS.wt',
                       db_cache_size='4G') as table, \
            table.open_index('CHROM+POS') as index:
        # row_id is the first of the requested columns
        cols = ['row_id', 'CHROM', 'POS']
        for row in index.cursor(cols, start=(chrom, start_pos),
                                stop=(chrom, end_pos)):
            if row[0] in ids_to_check:
                ids.add(row[0])
    return ids
def count_Ts_Tv(homedir):
    """
    Count number of transitions and transversions using the counter of
    an index on REF+ALT.

    Every ordered pair of distinct bases among A, C, G and T is looked up
    in the counter. Pairs within the same class (purine or pyrimidine)
    count as transitions, all others as transversions. Returns the tuple
    (Ts, Tv).
    """
    bases = {
        b'A': 'purine',
        b'G': 'purine',
        b'C': 'pyrimidine',
        b'T': 'pyrimidine'
    }
    Ts, Tv = 0, 0
    # "with" closes the index and then the table, also when a lookup fails
    with wt.open_table(homedir) as t, t.open_index("REF+ALT") as i:
        c = i.counter()
        # permutations() is iterated directly; no intermediate list needed
        for s in permutations([b'A', b'C', b'G', b'T'], 2):
            if bases[s[0]] == bases[s[1]]:
                Ts += c[s]
            else:
                Tv += c[s]
    return Ts, Tv
def __init__(self, variantSetId, wtDir):
    """
    Allocates a new WormtableVariantSet with the specified variantSetId
    based on the specified wormtable directory.

    Opens the table plus its CHROM+POS and CHROM+ID indexes, records the
    directory's ctime (in milliseconds) as both creation and update time,
    and builds the lookup structures for the columns after FILTER_COL:
    _infoCols, a list of (info field name, column) tuples; _sampleCols,
    a dict mapping each sample name to a list of (field name, column)
    tuples; and _sampleNames, the sample names in order of appearance.
    """
    self._variantSetId = variantSetId
    self._wtDir = wtDir
    self._table = wt.open_table(wtDir)
    self._chromPosIndex = self._table.open_index("CHROM+POS")
    self._chromIdIndex = self._table.open_index("CHROM+ID")
    self._sampleCols = {}
    self._sampleNames = []
    self._infoCols = []
    self._firstSamplePosition = -1
    # ctime is in seconds, and we want milliseconds since the epoch
    self._creationTime = self._updatedTime = int(
        os.path.getctime(wtDir) * 1000)
    for column in self._table.columns()[self.FILTER_COL + 1:]:
        fullName = column.get_name()
        if fullName.startswith("INFO"):
            self._infoCols.append((fullName.split(".")[1], column))
            continue
        # Anything that is not an INFO column is sample specific; the
        # first one seen marks where the sample columns begin.
        if self._firstSamplePosition == -1:
            self._firstSamplePosition = column.get_position()
        sampleName, fieldName = fullName.split(".")
        perSample = self._sampleCols.get(sampleName)
        if perSample is None:
            perSample = self._sampleCols[sampleName] = []
            self._sampleNames.append(sampleName)
        perSample.append((fieldName, column))
def get_variants_in_given_regions_from_previous_results(
        inp_folder, chrom, start_pos, end_pos, previous_results):
    """
    Open the CHROM+POS wormtable (assumed to be named
    'inp_folder/CHROM+POS.wt') within inp_folder and return the set of
    row IDs found through the 'CHROM+POS' index between the keys
    (chrom, start_pos) and (chrom, end_pos), keeping only the row IDs
    listed in previous_results.

    previous_results is the path of a tab-separated file with a 1-line
    header whose left-most field is the integer row_id.

    NOTE(review): presumably the cursor's stop key is exclusive, so a
    variant located exactly at end_pos may be left out -- confirm against
    the wormtable cursor documentation.
    """
    with open(previous_results) as f:
        next(f, None)  # skip the header line
        ids_to_check = set(int(line.split('\t')[0]) for line in f)

    ids = set()
    # "with" closes the index before the table, also when an error occurs
    with wt.open_table(inp_folder + '/CHROM+POS.wt',
                       db_cache_size='4G') as table, \
            table.open_index('CHROM+POS') as index:
        # row_id is the first of the requested columns
        cols = ['row_id', 'CHROM', 'POS']
        for row in index.cursor(cols, start=(chrom, start_pos),
                                stop=(chrom, end_pos)):
            if row[0] in ids_to_check:
                ids.add(row[0])
    return ids
def __init__(self, variantSetId, wtDir):
    """
    Allocates a new WormtableDataset with the specified variantSetId
    based on the specified wormtable directory.

    Opens the table plus its CHROM+POS and CHROM+ID indexes, records the
    directory's ctime (in milliseconds) as both creation and update time,
    and builds the lookup structures for the columns after FILTER_COL:
    _infoCols, a list of (info field name, column) tuples; _sampleCols,
    a dict mapping each sample name to a list of (field name, column)
    tuples; and _sampleNames, the sample names in order of appearance.
    """
    self._variantSetId = variantSetId
    self._wtDir = wtDir
    self._table = wt.open_table(wtDir)
    self._chromPosIndex = self._table.open_index("CHROM+POS")
    self._chromIdIndex = self._table.open_index("CHROM+ID")
    self._sampleCols = {}
    self._sampleNames = []
    self._infoCols = []
    self._firstSamplePosition = -1
    # ctime is in seconds, and we want milliseconds since the epoch
    ctimeMillis = int(os.path.getctime(wtDir) * 1000)
    self._creationTime = ctimeMillis
    self._updatedTime = ctimeMillis
    for column in self._table.columns()[self.FILTER_COL + 1:]:
        fullName = column.get_name()
        if fullName.startswith("INFO"):
            infoField = fullName.split(".")[1]
            self._infoCols.append((infoField, column))
            continue
        # Anything that is not an INFO column is sample specific; the
        # first one seen marks where the sample columns begin.
        if self._firstSamplePosition == -1:
            self._firstSamplePosition = column.get_position()
        sampleName, infoName = fullName.split(".")
        entries = self._sampleCols.get(sampleName)
        if entries is None:
            entries = self._sampleCols[sampleName] = []
            self._sampleNames.append(sampleName)
        entries.append((infoName, column))
def main():
    """
    Command line entry point: parses the arguments, opens the table in
    the given home directory with its CHROM+POS index, and prints each
    row yielded by snp_filter as a tab separated line.
    """
    cli = argparse.ArgumentParser(description=globals()['__doc__'])
    # NOTE: argparse ignores "default" on a required positional such as
    # cols; it is kept because it is part of the existing definition.
    cli.add_argument(
        'cols',
        default="CHROM,POS",
        help='comma separated column names to print')
    cli.add_argument(
        '-i',
        default='i',
        choices=['i', 'e', 'f'],
        help='indel mode: i=include, e=exclude, f=find [default=i]')
    filter_help = ('specify semicolon separated filters as '
                   'COLUMN(>=|<=|>|<|==|!=)VALUE,'
                   ' e.g. "QUAL>20;SAMPLE.GT==0/0"')
    cli.add_argument('-f', help=filter_help)
    cli.add_argument(
        '-r',
        help='region, e.g. 1:300-500 (start and end inclusive)')
    cli.add_argument('homedir', help='home directory of database')
    # snp_filter expects the options as a dict, not a Namespace
    args = vars(cli.parse_args())
    with wt.open_table(args['homedir']) as table:
        with table.open_index("CHROM+POS") as chrom_pos:
            for row in snp_filter(table, chrom_pos, args):
                fields = [str(value) for value in row]
                print('\t'.join(fields))
def hq_snps(homedir, minq, cols):
    """
    Generator over the columns cols of the table in homedir, read through
    a cursor bound to the "QUAL[1]" index with its minimum set to minq.

    The table and the index stay open while the generator is being
    consumed and are closed when it finishes.
    """
    with wt.open_table(homedir) as table:
        with table.open_index("QUAL[1]") as qual_index:
            rows = table.cursor(cols, qual_index)
            rows.set_min(minq)
            for row in rows:
                yield row
def _previous_results_value_passes(field, operator, cutoff, keep_novalue):
    """
    Decide whether a single field value satisfies the filter.

    field is the value read from the wormtable (None, str, tuple or a
    number), operator is one of 'greater_than', 'less_than', 'equal_to'
    or 'contains_keyword', cutoff is the threshold/keyword string and
    keep_novalue is the string 'True' when missing values must be kept.

    For multi-valued fields the row passes as soon as one value passes.
    Calls sys.exit() when a numeric comparison is requested on a string
    value that is neither missing nor of the form NUM/NUM.
    """
    keep_missing = keep_novalue == 'True'

    # Missing value: None or empty string.
    if field is None or field == '':
        return keep_missing

    # String value, possibly a comma separated list.
    if isinstance(field, str):
        if operator == 'greater_than' or operator == 'less_than':
            for value in field.split(','):
                if value == '' or value == 'nan':
                    if keep_missing:
                        return True
                elif value.find('/') != -1:
                    # NUM/NUM is stored as a string by wormtable, so the
                    # ratio is compared against the cutoff instead.
                    parts = value.split('/')
                    ratio = float(parts[0]) / float(parts[1])
                    if operator == 'greater_than':
                        if ratio > float(cutoff):
                            return True
                    elif operator == 'less_than':
                        if ratio < float(cutoff):
                            return True
                else:
                    sys.stderr.write('\nError: ' + operator +
                                     ' incompatible with field type (string).\n')
                    sys.exit()
        elif operator == 'equal_to':
            for value in field.split(','):
                if value == '' or value == 'nan':
                    if keep_missing:
                        return True
                elif value.find('/') != -1:
                    parts = value.split('/')
                    if float(parts[0]) / float(parts[1]) == float(cutoff):
                        return True
                elif value == cutoff:
                    return True
        elif operator == 'contains_keyword':
            for value in field.split(','):
                if value == '' or value == 'nan':
                    if keep_missing:
                        return True
                for keyword in set(cutoff.split(',')):
                    if value.find(keyword) != -1:
                        return True
        return False

    # Tuple value: one entry per element.
    if isinstance(field, tuple):
        if operator == 'greater_than':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                elif value > float(cutoff):
                    return True
        elif operator == 'less_than':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                elif value < float(cutoff):
                    return True
        elif operator == 'equal_to':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                elif value == float(cutoff):
                    return True
        elif operator == 'contains_keyword':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                for keyword in set(cutoff.split(',')):
                    if value.find(keyword) != -1:
                        return True
        return False

    # Scalar int/float value.
    if is_number(field):
        if math.isnan(field):
            return keep_missing
        elif operator == 'greater_than':
            return bool(field > float(cutoff))
        elif operator == 'less_than':
            return bool(field < float(cutoff))
        elif operator == 'equal_to':
            return bool(field == float(cutoff))
        elif operator == 'contains_keyword':
            for keyword in set(cutoff.split(',')):
                if field.find(keyword) != -1:
                    return True
    return False


def filter_variants_from_previous_results(inp_folder, field_name, operator,
                                          cutoff, keep_novalue,
                                          previous_results):
    """
    Open the wormtable 'inp_folder/field_name.wt' and return the set of
    row_id values whose field value satisfies operator/cutoff.

    Only rows whose row_id is listed in previous_results are examined.
    previous_results is the path of a tab-separated file with a one-line
    header and row_id as the left-most field. Blank lines in that file
    are ignored.

    The wormtable is assumed to have two columns: 'row_id' and field_name.
    """
    # Collect the row IDs to check. The first line is always the header.
    ids_to_check = set()
    with open(previous_results) as results:
        next(results, None)
        for line in results:
            if not line.strip():
                continue
            ids_to_check.add(int(line.split('\t')[0]))

    ids = set()
    # Context managers make sure the index and then the table are closed
    # even when the loop raises or exits.
    with wt.open_table(inp_folder + '/' + field_name + '.wt',
                       db_cache_size='4G') as table:
        with table.open_index('row_id') as index:
            for row in index.cursor(['row_id', field_name]):
                row_id = row[0]
                if row_id not in ids_to_check:
                    continue
                if _previous_results_value_passes(row[1], operator, cutoff,
                                                  keep_novalue):
                    ids.add(row_id)
    return ids
def _field_value_passes(field, operator, cutoff, keep_novalue):
    """
    Decide whether a single field value satisfies the filter.

    field is the value read from the wormtable (None, str, tuple or a
    number), operator is one of 'greater_than', 'less_than', 'equal_to'
    or 'contains_keyword', cutoff is the threshold/keyword string and
    keep_novalue is the string 'True' when missing values must be kept.

    For multi-valued fields the row passes as soon as one value passes.
    Calls sys.exit() when a numeric comparison is requested on a string
    value that is neither missing nor of the form NUM/NUM.
    """
    keep_missing = keep_novalue == 'True'

    # Missing value: None or empty string.
    if field is None or field == '':
        return keep_missing

    # String value, possibly a comma separated list.
    if isinstance(field, str):
        if operator == 'greater_than' or operator == 'less_than':
            for value in field.split(','):
                if value == '' or value == 'nan':
                    if keep_missing:
                        return True
                elif value.find('/') != -1:
                    # NUM/NUM is stored as a string by wormtable, so the
                    # ratio is compared against the cutoff instead.
                    parts = value.split('/')
                    ratio = float(parts[0]) / float(parts[1])
                    if operator == 'greater_than':
                        if ratio > float(cutoff):
                            return True
                    elif operator == 'less_than':
                        if ratio < float(cutoff):
                            return True
                else:
                    sys.stderr.write('\nError: ' + operator +
                                     ' incompatible with field type (string).\n')
                    sys.exit()
        elif operator == 'equal_to':
            for value in field.split(','):
                if value == '' or value == 'nan':
                    if keep_missing:
                        return True
                elif value.find('/') != -1:
                    parts = value.split('/')
                    if float(parts[0]) / float(parts[1]) == float(cutoff):
                        return True
                elif value == cutoff:
                    return True
        elif operator == 'contains_keyword':
            for value in field.split(','):
                if value == '' or value == 'nan':
                    if keep_missing:
                        return True
                for keyword in set(cutoff.split(',')):
                    if value.find(keyword) != -1:
                        return True
        return False

    # Tuple value: one entry per element.
    if isinstance(field, tuple):
        if operator == 'greater_than':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                elif value > float(cutoff):
                    return True
        elif operator == 'less_than':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                elif value < float(cutoff):
                    return True
        elif operator == 'equal_to':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                elif value == float(cutoff):
                    return True
        elif operator == 'contains_keyword':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                for keyword in set(cutoff.split(',')):
                    if value.find(keyword) != -1:
                        return True
        return False

    # Scalar int/float value.
    if is_number(field):
        if math.isnan(field):
            return keep_missing
        elif operator == 'greater_than':
            return bool(field > float(cutoff))
        elif operator == 'less_than':
            return bool(field < float(cutoff))
        elif operator == 'equal_to':
            return bool(field == float(cutoff))
        elif operator == 'contains_keyword':
            for keyword in set(cutoff.split(',')):
                if field.find(keyword) != -1:
                    return True
    return False


def filter_variants(inp_folder, field_name, operator, cutoff, keep_novalue):
    """
    Open the wormtable 'inp_folder/field_name.wt' and return the set of
    row_id values whose field value satisfies operator/cutoff.

    The wormtable is assumed to have two columns: 'row_id' and field_name,
    so every row read is a (row_id, field value) pair.
    """
    ids = set()
    # The context manager closes the table even when the loop raises or
    # exits through sys.exit().
    with wt.open_table(inp_folder + '/' + field_name + '.wt',
                       db_cache_size='4G') as table:
        for row in table.cursor(['row_id', field_name]):
            if _field_value_passes(row[1], operator, cutoff, keep_novalue):
                ids.add(row[0])
    return ids
def _field_value_passes_with_presence(field, operator, cutoff, keep_novalue):
    """
    Decide whether a single field value satisfies the filter.

    field is the value read from the wormtable (None, str, tuple or a
    number). operator is one of 'is_present', 'is_absent', 'greater_than',
    'less_than', 'equal_to' or 'contains_keyword'. cutoff is the
    threshold/keyword string and keep_novalue is the string 'True' when
    missing values must be kept; neither is consulted for 'is_present',
    and for 'is_absent' only keep_novalue is, on a missing value.

    For multi-valued fields the row passes as soon as one value passes.
    Calls sys.exit() when a numeric comparison is requested on a string
    value that is neither missing nor of the form NUM/NUM. Re-raises
    AttributeError (after writing a message to stderr) when
    'contains_keyword' is applied to a value without a find() method.
    """
    keep_missing = keep_novalue == 'True'

    # Missing value: None or empty string.
    if field is None or field == '':
        if operator == 'is_absent':
            return True
        return keep_missing and operator != 'is_present'

    # String value, possibly a comma separated list.
    if isinstance(field, str):
        if operator == 'is_present':
            # Present when something is left after dropping 'nan' entries.
            return field.replace('nan', '').replace(',', '') != ''
        elif operator == 'is_absent':
            # Absent when the field holds nothing but 'nan' entries.
            return field.replace('nan', '').replace(',', '') == ''
        elif operator == 'greater_than' or operator == 'less_than':
            for value in field.split(','):
                if value == '' or value == 'nan':
                    if keep_missing:
                        return True
                elif value.find('/') != -1:
                    # NUM/NUM is stored as a string by wormtable, so the
                    # ratio is compared against the cutoff instead.
                    parts = value.split('/')
                    ratio = float(parts[0]) / float(parts[1])
                    if operator == 'greater_than':
                        if ratio > float(cutoff):
                            return True
                    elif operator == 'less_than':
                        if ratio < float(cutoff):
                            return True
                else:
                    sys.stderr.write('\nError: ' + operator +
                                     ' incompatible with field type (string).\n')
                    sys.exit()
        elif operator == 'equal_to':
            for value in field.split(','):
                if value == '' or value == 'nan':
                    if keep_missing:
                        return True
                elif value.find('/') != -1:
                    parts = value.split('/')
                    if float(parts[0]) / float(parts[1]) == float(cutoff):
                        return True
                elif value == cutoff:
                    return True
        elif operator == 'contains_keyword':
            for value in field.split(','):
                if value == '' or value == 'nan':
                    if keep_missing:
                        return True
                for keyword in set(cutoff.split(',')):
                    if value.find(keyword) != -1:
                        return True
        return False

    # Tuple value: one entry per element.
    if isinstance(field, tuple):
        if operator == 'is_present':
            return set(map(str, field)) != set(['nan'])
        elif operator == 'is_absent':
            return set(map(str, field)) == set(['nan'])
        elif operator == 'greater_than':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                elif value > float(cutoff):
                    return True
        elif operator == 'less_than':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                elif value < float(cutoff):
                    return True
        elif operator == 'equal_to':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                elif value == float(cutoff):
                    return True
        elif operator == 'contains_keyword':
            for value in field:
                if math.isnan(value):
                    if keep_missing:
                        return True
                for keyword in set(cutoff.split(',')):
                    try:
                        if value.find(keyword) != -1:
                            return True
                    except AttributeError:
                        sys.stderr.write('\nError: ' + operator +
                                         ' incompatible with field type (tuple).\n')
                        raise
        return False

    # Scalar int/float value.
    if is_number(field):
        if operator == 'is_present':
            return True
        elif operator == 'greater_than':
            return bool(field > float(cutoff))
        elif operator == 'less_than':
            return bool(field < float(cutoff))
        elif operator == 'equal_to':
            return bool(field == float(cutoff))
        elif operator == 'contains_keyword':
            for keyword in set(cutoff.split(',')):
                try:
                    if field.find(keyword) != -1:
                        return True
                except AttributeError:
                    sys.stderr.write('\nError: ' + operator +
                                     ' incompatible with field type (number).\n')
                    raise
    return False


def filter_variants(inp_folder, field_name, operator, cutoff, keep_novalue):
    """
    Open the wormtable 'inp_folder/field_name.wt' and return the set of
    row_id values whose field value satisfies operator/cutoff.

    NOTE: if operator is 'is_present' or 'is_absent', cutoff is ignored.
    keep_novalue is ignored for 'is_present'; for 'is_absent' a missing
    value is reported regardless of it.

    The wormtable is assumed to have two columns: 'row_id' and field_name,
    so every row read is a (row_id, field value) pair.
    """
    ids = set()
    # The context manager closes the table even when the loop raises or
    # exits through sys.exit().
    with wt.open_table(inp_folder + '/' + field_name + '.wt',
                       db_cache_size='4G') as table:
        for row in table.cursor(['row_id', field_name]):
            if _field_value_passes_with_presence(row[1], operator, cutoff,
                                                 keep_novalue):
                ids.add(row[0])
    return ids
def setUp(self):
    """
    Runs the parent setUp, converts EXAMPLE_VCF into the test home
    directory through vcf2wt.main and opens the resulting table.
    """
    super(WtadminTest, self).setUp()
    conversion_args = [EXAMPLE_VCF, self._homedir, "-fq"]
    vcf2wt.main(conversion_args)
    # Kept on the instance so the individual tests can read from it.
    self._table = wt.open_table(self._homedir)
def setUp(self):
    """
    Runs the parent setUp, converts EXAMPLE_VCF into the test home
    directory through wt.vcf2wt_main and opens the resulting table.
    """
    super(WtadminTest, self).setUp()
    conversion_args = [EXAMPLE_VCF, self._homedir, "-fq"]
    wt.vcf2wt_main(conversion_args)
    # Kept on the instance so the individual tests can read from it.
    self._table = wt.open_table(self._homedir)
def __init__(self, homedir, chrs, cols, wsize=10000, db_cache_size="256M"):
    """
    Opens the wormtable in homedir and its "CHROM+POS" index, both with
    the given db_cache_size, and stores chrs, cols and wsize on the
    instance for later use.
    """
    table = wt.open_table(homedir, db_cache_size=db_cache_size)
    self.__table = table
    self.__index = table.open_index(
        "CHROM+POS", db_cache_size=db_cache_size)
    self.__wsize = wsize
    self.__chrs = chrs
    self.__cols = cols