Beispiel #1
0
 def _test_stdin_input(self, input_file):
     """
     Runs the command once with ``input_file`` given as a path and once
     with "-" while ``input_file`` is supplied on stdin, then asserts that
     the two resulting tables are equal. The home directory is removed and
     recreated afterwards.
     """
     from_path = os.path.join(self._homedir, "original")
     from_stdin = os.path.join(self._homedir, "stdin")
     # Reference run: the input is named on the command line.
     self.run_command([input_file, from_path, "-q"])
     # Second run: the same bytes arrive through stdin.
     with open(input_file, "rb") as stream:
         self.run_command(["-", from_stdin], stdin=stream)
     with wt.open_table(from_path) as expected, \
             wt.open_table(from_stdin) as actual:
         self.assert_tables_equal(expected, actual)
     # Start from an empty home directory for whatever runs next.
     shutil.rmtree(self._homedir)
     os.mkdir(self._homedir)
Beispiel #2
0
 def _test_stdin_input(self, input_file):
     """
     Runs the command once with ``input_file`` given as a path and once
     with "-" while ``input_file`` is supplied on stdin, then asserts that
     the two resulting tables are equal. The home directory is removed and
     recreated afterwards.
     """
     from_path = os.path.join(self._homedir, "original")
     from_stdin = os.path.join(self._homedir, "stdin")
     # Reference run: the input is named on the command line.
     self.run_command([input_file, from_path, "-q"])
     # Second run: the same bytes arrive through stdin.
     with open(input_file, "rb") as stream:
         self.run_command(["-", from_stdin], stdin=stream)
     with wt.open_table(from_path) as expected, \
             wt.open_table(from_stdin) as actual:
         self.assert_tables_equal(expected, actual)
     # Start from an empty home directory for whatever runs next.
     shutil.rmtree(self._homedir)
     os.mkdir(self._homedir)
Beispiel #3
0
 def _test_gzipped_input(self, input_file):
     """
     Runs the command on ``input_file`` and on a gzip-compressed copy of
     it, then asserts that the two resulting tables are equal. The home
     directory is removed and recreated afterwards.
     """
     original = os.path.join(self._homedir, "original")
     zipped = os.path.join(self._homedir, "zipped")
     self.run_command([input_file, original, "-q"])
     zgtf = os.path.join(self._homedir, "gtf.gz")
     # Context managers close both handles even if the copy fails, and
     # copyfileobj streams the data instead of loading the whole file.
     with open(input_file, "rb") as src, gzip.open(zgtf, "wb") as dst:
         shutil.copyfileobj(src, dst)
     self.run_command([zgtf, zipped, "-qf"])
     with wt.open_table(original) as t1:
         with wt.open_table(zipped) as t2:
             self.assert_tables_equal(t1, t2)
     # Start from an empty home directory for whatever runs next.
     shutil.rmtree(self._homedir)
     os.mkdir(self._homedir)
Beispiel #4
0
 def _test_gzipped_input(self, input_file):
     """
     Runs the command on ``input_file`` and on a gzip-compressed copy of
     it, then asserts that the two resulting tables are equal. The home
     directory is removed and recreated afterwards.
     """
     original = os.path.join(self._homedir, "original")
     zipped = os.path.join(self._homedir, "zipped")
     self.run_command([input_file, original, "-q"])
     zgtf = os.path.join(self._homedir, "gtf.gz")
     # Context managers close both handles even if the copy fails, and
     # copyfileobj streams the data instead of loading the whole file.
     with open(input_file, "rb") as src, gzip.open(zgtf, "wb") as dst:
         shutil.copyfileobj(src, dst)
     self.run_command([zgtf, zipped, "-qf"])
     with wt.open_table(original) as t1:
         with wt.open_table(zipped) as t2:
             self.assert_tables_equal(t1, t2)
     # Start from an empty home directory for whatever runs next.
     shutil.rmtree(self._homedir)
     os.mkdir(self._homedir)
Beispiel #5
0
def hq_snps_bygt(homedir, sample, gt, minq, cols):
    """
    Print selected columns of the rows of one genotype above a quality bound.

    Opens the table in ``homedir`` and its index named
    ``"<sample>.GT+QUAL[1]"``, then prints the columns ``cols`` of every row
    the index cursor yields between the keys ``(gt, minq)`` and
    ``(gt, <largest QUAL key for gt> + 1)``, one tab-separated line per row.
    The table and the index are closed when the function returns or raises.
    """
    index_name = "{0}.GT+QUAL[1]".format(sample)
    with wt.open_table(homedir) as table:
        with table.open_index(index_name) as index:
            start = (gt, minq)
            # One past the largest key for this genotype
            # (presumably the stop key is exclusive; confirm in wormtable docs).
            stop = (gt, index.max_key(gt)[1] + 1)
            for row in index.cursor(cols, start=start, stop=stop):
                print("\t".join([str(value) for value in row]))
Beispiel #6
0
def load_wt_file(wt_file, individuals=None):
    """
    Open a wormtable and return a cursor over the genotype columns.

    wt_file: path handed to wormtable.open_table.
    individuals: list of sample names; the column "<name>.GT" is read for
      each of them. Defaults to an empty list.

    Returns the tuple (cursor, individuals). The table is left open because
    the returned cursor reads from it.
    """
    # A fresh list per call: a literal [] default would be shared between
    # calls and handed back to the caller, who could then mutate it.
    if individuals is None:
        individuals = []
    genotypes = [ind + '.GT' for ind in individuals]
    table = wormtable.open_table(wt_file)
    tc = table.cursor(genotypes)
    return tc, individuals
Beispiel #7
0
def main():
    """
    Command line entry point: parse the arguments, open the table and its
    CHROM+POS index, and print every row produced by snp_filter as one
    tab-separated line.
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # The order of the add_argument calls fixes the order of the positionals.
    parser.add_argument(
        'cols',
        default="CHROM,POS",
        help='comma separated column names to print')
    parser.add_argument(
        '-i',
        choices=['i', 'e', 'f'],
        default='i',
        help='indel mode: i=include, e=exclude, f=find [default=i]')
    parser.add_argument(
        '-f',
        help=
        'specify semicolon separated filters as COLUMN(>=|<=|>|<|==|!=)VALUE,\
              e.g. "QUAL>20;SAMPLE.GT==0/0"')
    parser.add_argument(
        '-r',
        help='region, e.g. 1:300-500 (start and end inclusive)')
    parser.add_argument(
        'homedir',
        help='home directory of database')

    # snp_filter receives the options as a plain dictionary.
    args = vars(parser.parse_args())

    with wt.open_table(args['homedir']) as table:
        with table.open_index("CHROM+POS") as index:
            for row in snp_filter(table, index, args):
                print('\t'.join(str(field) for field in row))
Beispiel #8
0
def retrieve_variants_by_rowid(inp_folder, ids, out_file):
    """
  Use the row IDs in ids to query the complete wormtable (containing all variant
  fields) and return all the information about the filtered variants.
  """

    # open table and load indices
    table = wt.open_table(inp_folder + '/schema.wt', db_cache_size='4G')
    index = table.open_index('row_id')
    # retrieve the rows using the 'row_id' field and write the results in out_file
    col_names = [col.get_name() for col in table.columns()]
    row_id_idx = col_names.index('row_id')
    out = open(out_file, 'w')
    out.write('\t'.join(col_names) + '\n')
    for row in index.cursor(col_names):
        if row[row_id_idx] in ids:
            to_write = list()
            for value in row:
                try:  # value is a number (int or float)
                    to_write.append(int(value))
                except TypeError, e:  # value is a tuple
                    if value is not None:
                        to_write.append(','.join([str(x) for x in value]))
                    else:
                        to_write.append(None)
                except ValueError, e:  # value is a string
                    to_write.append(value)
                except:
def retrieve_variants_by_rowid(inp_folder, ids, out_file):
  """
  Use the row IDs in ids to query the complete wormtable (containing all variant
  fields) and return all the information about the filtered variants.
  """

  # open table and load indices
  table = wt.open_table(inp_folder + '/schema.wt', db_cache_size='4G')
  index = table.open_index('row_id')
  # retrieve the rows using the 'row_id' field and write the results in out_file
  col_names = [col.get_name() for col in table.columns()]
  row_id_idx = col_names.index('row_id')
  out = open(out_file, 'w')
  out.write('\t'.join(col_names) + '\n')
  for row in index.cursor(col_names):
    if row[row_id_idx] in ids:
      to_write = list()
      for value in row:
        try:  # value is a number (int or float)
          to_write.append(int(value))
        except TypeError, e:  # value is a tuple
          if value is not None:
            to_write.append(','.join([str(x) for x in value]))
          else:
            to_write.append(None)
        except ValueError, e:  # value is a string
          to_write.append(value)
        except:
def get_variants_assoc_to_gene_set(inp_folder, genes, field_name,
  negative_query):
  """
  Open the field_name wormtable (assumed to be named 'inp_folder/field_name.wt')
  and collect the row IDs whose field_name value mentions at least one gene.

  The field value is split on ',' and a row matches when any gene from genes
  occurs as a substring of any of the resulting items. Rows whose value is
  missing (None) never match.

  negative_query selects what is returned. It accepts the strings 'True' and
  'False' as well as the booleans True and False:
    - True / 'True':   the set of row IDs that did NOT match;
    - False / 'False': the set of row IDs that matched.
  Any other value returns None, as it always did.
  """

  # open wormtable for the field of interest
  table = wt.open_table(inp_folder + '/' + field_name + '.wt',
          db_cache_size='4G')
  all_ids = set()
  pos_ids = set()
  try:
    # the cursor yields (row_id, field value) pairs
    for row_id, field in table.cursor(['row_id', field_name]):
      all_ids.add(row_id)
      if field is None:
        continue
      if any(gene in value for value in field.split(',') for gene in genes):
        pos_ids.add(row_id)
  finally:
    # close the table even when reading a row fails
    table.close()
  if negative_query in (True, 'True'):
    return all_ids - pos_ids
  elif negative_query in (False, 'False'):
    return pos_ids
def get_total_variant_count(out_folder):
  """
  Get the total (initial) number of variants.

  Opens the wormtable 'out_folder/schema.wt', returns len() of the table and
  closes the table again before returning.
  """

  tbl = wt.open_table(os.path.join(out_folder, 'schema.wt'))
  try:
    return len(tbl)
  finally:
    # the table used to be left open after the count was taken
    tbl.close()
Beispiel #12
0
def hq_snps_bygt(homedir, sample, gt, minq, cols):
    """
    Print selected columns of the rows of one genotype above a quality bound.

    Opens the table in ``homedir`` and its index named
    ``"<sample>.GT+QUAL[1]"``, then prints the columns ``cols`` of every row
    the index cursor yields between the keys ``(gt, minq)`` and
    ``(gt, <largest QUAL key for gt> + 1)``, one tab-separated line per row.
    The table and the index are closed when the function returns or raises.
    """
    index_name = "{0}.GT+QUAL[1]".format(sample)
    with wt.open_table(homedir) as table:
        with table.open_index(index_name) as index:
            start = (gt, minq)
            # One past the largest key for this genotype
            # (presumably the stop key is exclusive; confirm in wormtable docs).
            stop = (gt, index.max_key(gt)[1] + 1)
            for row in index.cursor(cols, start=start, stop=stop):
                print("\t".join([str(value) for value in row]))
def get_total_variant_count(out_folder):
  """
  Get the total (initial) number of variants.

  Opens the wormtable 'out_folder/schema.wt', returns len() of the table and
  closes the table again before returning.
  """

  tbl = wt.open_table(os.path.join(out_folder, 'schema.wt'))
  try:
    return len(tbl)
  finally:
    # the table used to be left open after the count was taken
    tbl.close()
Beispiel #14
0
def load_wt_file(wt_file, individuals=None):
    """
    Open a wormtable and return a cursor over the genotype columns.

    wt_file: path handed to wormtable.open_table.
    individuals: list of sample names; the column "<name>.GT" is read for
      each of them. Defaults to an empty list.

    Returns the tuple (cursor, individuals). The table is left open because
    the returned cursor reads from it.
    """
    # A fresh list per call: a literal [] default would be shared between
    # calls and handed back to the caller, who could then mutate it.
    if individuals is None:
        individuals = []
    genotypes = [ind + '.GT' for ind in individuals]
    table = wormtable.open_table(wt_file)
    tc = table.cursor(genotypes)
    return tc, individuals
Beispiel #15
0
 def __init__(self, variantSetId, wtDir):
     """
     Creates a WormtableDataset for the given variantSetId, backed by the
     wormtable stored in the directory wtDir.
     """
     self._variantSetId = variantSetId
     self._wtDir = wtDir
     self._table = wt.open_table(wtDir)
     self._chromPosIndex = self._table.open_index("CHROM+POS")
     self._chromIdIndex = self._table.open_index("CHROM+ID")
     self._sampleCols = {}
     self._infoCols = []
     self._firstSamplePosition = -1
     # Only the columns after FILTER are classified. INFO columns are
     # collected as (info name, column) pairs; the columns of each sample
     # are collected per sample name as a list of (column, field name)
     # pairs, so that they are easy to find during conversion.
     for column in self._table.columns()[self.FILTER_COL + 1:]:
         fullName = column.get_name()
         if fullName.startswith("INFO"):
             infoName = fullName.split(".")[1]
             self._infoCols.append((infoName, column))
             continue
         if fullName.endswith(".GT"):
             # The .GT column is assumed to come first for each sample,
             # so it is the one that creates the sample's list.
             sampleName = fullName.split(".")[0]
             self._sampleCols[sampleName] = [(column, "GT")]
             if self._firstSamplePosition == -1:
                 self._firstSamplePosition = column.get_position()
             continue
         # Anything else must be another column of an already seen sample.
         parts = fullName.split(".")
         self._sampleCols[parts[0]].append((column, parts[1]))
def filter_variants(inp_folder, genotype, samples_list):
  """
  Open the wormtable of every sample and collect the row IDs of the variants
  that have the requested genotype.

  inp_folder: not used by this function; kept so existing calls keep working.
  genotype: one of 'homref', 'homalt' or 'het'.
  samples_list: wormtable paths; each entry is passed to wt.open_table as is.
    Every table is read with the row ID in the first column and the genotype
    string in the second.

  Returns a dictionary mapping each entry of samples_list to the set of
  matching row IDs.

  The phasing separators '/' and '|' are ignored, so non-diploid genotypes
  work too. Genotypes that are missing, empty or start with '.' (e.g. './.')
  are skipped.
  """

  # samples_ids has sample names as keys and id sets as values
  samples_ids = dict()
  row_id_idx = 0
  sample_idx = 1
  for sample in samples_list:
    # open wormtable for the field of interest
    table = wt.open_table(sample, db_cache_size='4G')
    ids = set()
    try:
      for row in table:
        raw = row[sample_idx]
        if not raw:
          # missing or empty genotype: nothing to classify
          continue
        gen = raw.replace('/','').replace('|','')
        if not gen or gen[0] == '.':
          continue
        # all alleles equal -> homozygous, otherwise heterozygous
        if gen == len(gen)*gen[0]:
          if gen[0] == '0':
            kind = 'homref'
          else:
            kind = 'homalt'
        else:
          kind = 'het'
        if kind == genotype:
          ids.add(row[row_id_idx])
    finally:
      # close the table even when a row cannot be processed
      table.close()
    samples_ids[sample] = ids
  return samples_ids
def filter_variants(inp_folder, genotype, samples_list):
    """
    Open the wormtable of every sample and collect the row IDs of the
    variants that have the requested genotype.

    inp_folder: not used by this function; kept so existing calls keep
      working.
    genotype: one of 'homref', 'homalt' or 'het'.
    samples_list: wormtable paths; each entry is passed to wt.open_table as
      is. Every table is read with the row ID in the first column and the
      genotype string in the second.

    Returns a dictionary mapping each entry of samples_list to the set of
    matching row IDs.

    The phasing separators '/' and '|' are ignored, so non-diploid genotypes
    work too. Genotypes that are missing, empty or start with '.'
    (e.g. './.') are skipped.
    """

    # samples_ids has sample names as keys and id sets as values
    samples_ids = dict()
    row_id_idx = 0
    sample_idx = 1
    for sample in samples_list:
        # open wormtable for the field of interest
        table = wt.open_table(sample, db_cache_size='4G')
        ids = set()
        try:
            for row in table:
                raw = row[sample_idx]
                if not raw:
                    # missing or empty genotype: nothing to classify
                    continue
                gen = raw.replace('/', '').replace('|', '')
                if not gen or gen[0] == '.':
                    continue
                # all alleles equal -> homozygous, otherwise heterozygous
                if gen == len(gen) * gen[0]:
                    if gen[0] == '0':
                        kind = 'homref'
                    else:
                        kind = 'homalt'
                else:
                    kind = 'het'
                if kind == genotype:
                    ids.add(row[row_id_idx])
        finally:
            # close the table even when a row cannot be processed
            table.close()
        samples_ids[sample] = ids
    return samples_ids
def get_variants_assoc_to_gene_set(inp_folder, genes, field_name,
                                   negative_query):
    """
    Open the field_name wormtable (assumed to be named
    'inp_folder/field_name.wt') and collect the row IDs whose field_name
    value mentions at least one gene.

    The field value is split on ',' and a row matches when any gene from
    genes occurs as a substring of any of the resulting items. Rows whose
    value is missing (None) never match.

    negative_query selects what is returned. It accepts the strings 'True'
    and 'False' as well as the booleans True and False:
      - True / 'True':   the set of row IDs that did NOT match;
      - False / 'False': the set of row IDs that matched.
    Any other value returns None, as it always did.
    """

    # open wormtable for the field of interest
    table = wt.open_table(inp_folder + '/' + field_name + '.wt',
                          db_cache_size='4G')
    all_ids = set()
    pos_ids = set()
    try:
        # the cursor yields (row_id, field value) pairs
        for row_id, field in table.cursor(['row_id', field_name]):
            all_ids.add(row_id)
            if field is None:
                continue
            if any(gene in value
                   for value in field.split(',')
                   for gene in genes):
                pos_ids.add(row_id)
    finally:
        # close the table even when reading a row fails
        table.close()
    if negative_query in (True, 'True'):
        return all_ids - pos_ids
    elif negative_query in (False, 'False'):
        return pos_ids
def get_variants_of_given_type_from_previous_results(inp_folder, var_type,
  previous_results):
  """
  Open the REF+ALT wormtable (assumed to be named 'inp_folder/REF+ALT.wt') and
  return the set of row IDs of the variants of type var_type, looking only at
  the row IDs listed in previous_results.

  var_type: 'SNPs', 'InDels' or 'MNPs'. A row matches when at least one of its
    comma-separated ALT alleles satisfies, together with REF:
      SNPs:   both have length 1;
      InDels: the lengths differ;
      MNPs:   the lengths are equal and greater than 1.
    Any other value writes a message to stderr and exits with status 1.
  previous_results: path of a tab-separated file with a 1-line header and the
    row ID in the left-most field. Blank lines are ignored.

  Rows whose REF or ALT value is missing (None) are skipped.
  """

  # one test per variant type, taking (len(REF), len(alt))
  type_tests = {
    'SNPs': lambda ref_len, alt_len: ref_len == 1 and alt_len == 1,
    'InDels': lambda ref_len, alt_len: ref_len != alt_len,
    'MNPs': lambda ref_len, alt_len: ref_len > 1 and ref_len == alt_len,
  }
  # validate before touching any file, and report the failure to the shell
  if var_type not in type_tests:
    sys.stderr.write("\nVariant type not properly defined.\n")
    sys.exit(1)
  is_of_type = type_tests[var_type]
  # extract the row IDs to check from previous_results
  ids_to_check = set()
  with open(previous_results) as f:
    next(f, None)  # skip the header line
    for line in f:
      if line.strip():
        ids_to_check.add(int(line.split('\t')[0]))
  ids = set()
  # open REF+ALT wormtable
  table = wt.open_table(inp_folder + '/REF+ALT.wt', db_cache_size='4G')
  try:
    index = table.open_index('row_id')
    try:
      for row_id, ref, alts in index.cursor(['row_id', 'REF', 'ALT']):
        if row_id not in ids_to_check or ref is None or alts is None:
          continue
        ref_len = len(ref)
        if any(is_of_type(ref_len, len(alt)) for alt in alts.split(',')):
          ids.add(row_id)
    finally:
      # the index is closed before the table it belongs to
      index.close()
  finally:
    table.close()
  return ids
Beispiel #20
0
def get_variants_of_given_type_from_previous_results(inp_folder, var_type,
                                                     previous_results):
    """
    Open the REF+ALT wormtable (assumed to be named 'inp_folder/REF+ALT.wt')
    and return the set of row IDs of the variants of type var_type, looking
    only at the row IDs listed in previous_results.

    var_type: 'SNPs', 'InDels' or 'MNPs'. A row matches when at least one of
      its comma-separated ALT alleles satisfies, together with REF:
        SNPs:   both have length 1;
        InDels: the lengths differ;
        MNPs:   the lengths are equal and greater than 1.
      Any other value writes a message to stderr and exits with status 1.
    previous_results: path of a tab-separated file with a 1-line header and
      the row ID in the left-most field. Blank lines are ignored.

    Rows whose REF or ALT value is missing (None) are skipped.
    """

    # one test per variant type, taking (len(REF), len(alt))
    type_tests = {
        'SNPs': lambda ref_len, alt_len: ref_len == 1 and alt_len == 1,
        'InDels': lambda ref_len, alt_len: ref_len != alt_len,
        'MNPs': lambda ref_len, alt_len: ref_len > 1 and ref_len == alt_len,
    }
    # validate before touching any file, and report the failure to the shell
    if var_type not in type_tests:
        sys.stderr.write("\nVariant type not properly defined.\n")
        sys.exit(1)
    is_of_type = type_tests[var_type]
    # extract the row IDs to check from previous_results
    ids_to_check = set()
    with open(previous_results) as f:
        next(f, None)  # skip the header line
        for line in f:
            if line.strip():
                ids_to_check.add(int(line.split('\t')[0]))
    ids = set()
    # open REF+ALT wormtable
    table = wt.open_table(inp_folder + '/REF+ALT.wt', db_cache_size='4G')
    try:
        index = table.open_index('row_id')
        try:
            for row_id, ref, alts in index.cursor(['row_id', 'REF', 'ALT']):
                if (row_id not in ids_to_check or ref is None
                        or alts is None):
                    continue
                ref_len = len(ref)
                if any(is_of_type(ref_len, len(alt))
                       for alt in alts.split(',')):
                    ids.add(row_id)
        finally:
            # the index is closed before the table it belongs to
            index.close()
    finally:
        table.close()
    return ids
def filter_variants_from_previous_results(inp_folder, genotype, samples_list,
                                          previous_results):
    """
    Open the wormtable of every sample and collect the row IDs of the
    variants that have the requested genotype, looking only at the row IDs
    listed in previous_results.

    inp_folder: not used by this function; kept so existing calls keep
      working.
    genotype: one of 'homref', 'homalt' or 'het'.
    samples_list: wormtable paths; each entry is passed to wt.open_table as
      is, and its base name without '.wt' is used as the genotype column.
    previous_results: path of a tab-separated file with a 1-line header and
      the row ID in the left-most field. Blank lines are ignored.

    Returns a dictionary mapping each entry of samples_list to the set of
    matching row IDs.

    The phasing separators '/' and '|' are ignored, so non-diploid genotypes
    work too. Genotypes that are missing, empty or start with '.'
    (e.g. './.') are skipped.
    """

    # extract the row IDs to check from previous_results
    ids_to_check = set()
    with open(previous_results) as f:
        next(f, None)  # skip the header line
        for line in f:
            if line.strip():
                ids_to_check.add(int(line.split('\t')[0]))
    # samples_ids has sample names as keys and id sets as values
    samples_ids = dict()
    for sample in samples_list:
        column = os.path.basename(sample).replace('.wt', '')
        ids = set()
        # open wormtable for the field of interest
        table = wt.open_table(sample, db_cache_size='4G')
        try:
            index = table.open_index('row_id')
            try:
                for row_id, raw in index.cursor(['row_id', column]):
                    # only analyse rows listed in ids_to_check that carry a
                    # genotype at all
                    if row_id not in ids_to_check or not raw:
                        continue
                    gen = raw.replace('/', '').replace('|', '')
                    if not gen or gen[0] == '.':
                        continue
                    # all alleles equal -> homozygous, else heterozygous
                    if gen == len(gen) * gen[0]:
                        if gen[0] == '0':
                            kind = 'homref'
                        else:
                            kind = 'homalt'
                    else:
                        kind = 'het'
                    if kind == genotype:
                        ids.add(row_id)
            finally:
                # the index is closed before the table it belongs to
                index.close()
        finally:
            table.close()
        samples_ids[sample] = ids
    return samples_ids
def filter_variants_from_previous_results(inp_folder, genotype, samples_list,
  previous_results):
  """
  Open the wormtable of every sample and collect the row IDs of the variants
  that have the requested genotype, looking only at the row IDs listed in
  previous_results.

  inp_folder: not used by this function; kept so existing calls keep working.
  genotype: one of 'homref', 'homalt' or 'het'.
  samples_list: wormtable paths; each entry is passed to wt.open_table as is,
    and its base name without '.wt' is used as the genotype column.
  previous_results: path of a tab-separated file with a 1-line header and the
    row ID in the left-most field. Blank lines are ignored.

  Returns a dictionary mapping each entry of samples_list to the set of
  matching row IDs.

  The phasing separators '/' and '|' are ignored, so non-diploid genotypes
  work too. Genotypes that are missing, empty or start with '.' (e.g. './.')
  are skipped.
  """

  # extract the row IDs to check from previous_results
  ids_to_check = set()
  with open(previous_results) as f:
    next(f, None)  # skip the header line
    for line in f:
      if line.strip():
        ids_to_check.add(int(line.split('\t')[0]))
  # samples_ids has sample names as keys and id sets as values
  samples_ids = dict()
  for sample in samples_list:
    column = os.path.basename(sample).replace('.wt', '')
    ids = set()
    # open wormtable for the field of interest
    table = wt.open_table(sample, db_cache_size='4G')
    try:
      index = table.open_index('row_id')
      try:
        for row_id, raw in index.cursor(['row_id', column]):
          # only analyse rows listed in ids_to_check that carry a genotype
          if row_id not in ids_to_check or not raw:
            continue
          gen = raw.replace('/','').replace('|','')
          if not gen or gen[0] == '.':
            continue
          # all alleles equal -> homozygous, otherwise heterozygous
          if gen == len(gen)*gen[0]:
            if gen[0] == '0':
              kind = 'homref'
            else:
              kind = 'homalt'
          else:
            kind = 'het'
          if kind == genotype:
            ids.add(row_id)
      finally:
        # the index is closed before the table it belongs to
        index.close()
    finally:
      table.close()
    samples_ids[sample] = ids
  return samples_ids
Beispiel #23
0
 def setUp(self):
     """
     Opens every table found in the fixture's data directory together with
     its CHROM and CHROM+POS indexes, keyed by directory entry name, and
     creates the backend under test on the same directory.
     """
     self._dataDir = _wormtableTestFixture.dataDir
     self._tables = {}
     self._chromIndexes = {}
     self._chromPosIndexes = {}
     for entry in os.listdir(self._dataDir):
         table = wt.open_table(os.path.join(self._dataDir, entry))
         self._tables[entry] = table
         self._chromIndexes[entry] = table.open_index("CHROM")
         self._chromPosIndexes[entry] = table.open_index("CHROM+POS")
     self._backend = server.WormtableBackend(self._dataDir)
Beispiel #24
0
 def setUp(self):
     """
     Opens every table found in the fixture's data directory together with
     its CHROM and CHROM+POS indexes, keyed by directory entry name, and
     creates the backend under test on the same directory.
     """
     self._dataDir = _wormtableTestFixture.dataDir
     self._tables = {}
     self._chromIndexes = {}
     self._chromPosIndexes = {}
     for entry in os.listdir(self._dataDir):
         opened = wt.open_table(os.path.join(self._dataDir, entry))
         self._tables[entry] = opened
         self._chromIndexes[entry] = opened.open_index("CHROM")
         self._chromPosIndexes[entry] = opened.open_index("CHROM+POS")
     variantSetClass = variants.WormtableVariantSet
     self._backend = backend.Backend(self._dataDir, variantSetClass)
def count_Ts_Tv_wtcursor(homedir):
    """
    Count transitions and transversions by walking a table cursor over the
    REF and ALT columns, one row at a time.

    Rows are skipped when REF equals ALT or when either value is not a key
    of ``bases``. A substitution counts as a transition when both values map
    to the same entry in ``bases`` and as a transversion otherwise.

    Returns the tuple (transitions, transversions).
    """
    transitions = 0
    transversions = 0
    with wt.open_table(homedir) as table:
        for ref, alt in table.cursor(["REF", "ALT"]):
            if ref == alt or ref not in bases or alt not in bases:
                continue
            if bases[ref] == bases[alt]:
                transitions += 1
            else:
                transversions += 1
    return transitions, transversions
def count_Ts_Tv_wtindex(homedir):
    """
    Count transitions and transversions from the counter of the REF+ALT
    index, without reading individual rows.

    Every ordered pair of distinct keys of ``bases`` is looked up in the
    counter. Its count is added to the transitions when both members map to
    the same entry in ``bases`` and to the transversions otherwise.

    Returns the tuple (transitions, transversions).
    """
    transitions = 0
    transversions = 0
    with wt.open_table(homedir) as table:
        with table.open_index("REF+ALT") as index:
            counts = index.counter()
            for pair in permutations(bases.keys(), 2):
                ref, alt = pair
                if bases[ref] == bases[alt]:
                    transitions += counts[pair]
                else:
                    transversions += counts[pair]
    return transitions, transversions
Beispiel #27
0
 def test_open_api(self):
     """
     Exercises opening and closing of tables and indexes, both with
     explicit close() calls and through the context manager protocol.
     """
     # Create a small table so that there is something to open.
     table = wt.Table(self._homedir)
     table.add_id_column()
     table.add_uint_column("u1")
     table.open("w")
     self.assertTrue(table.is_open())
     table.close()
     self.assertFalse(table.is_open())
     # open_table hands back a table that is already open
     table = wt.open_table(self._homedir)
     self.assertTrue(table.is_open())
     table.close()
     self.assertFalse(table.is_open())
     # the same again, this time leaving the closing to the with block
     with wt.open_table(self._homedir) as table:
         self.assertTrue(table.is_open())
     self.assertFalse(table.is_open())
     # Build an index so that the index side of the API can be checked.
     table = wt.open_table(self._homedir)
     index_name = "test"
     index = wt.Index(table, index_name)
     index.add_key_column(table.get_column(1))
     index.open("w")
     index.build()
     index.close()
     # Now that it exists, the index can be opened through the table.
     index = table.open_index(index_name)
     self.assertTrue(index.is_open())
     index.close()
     self.assertFalse(index.is_open())
     with table.open_index(index_name) as index:
         self.assertTrue(index.is_open())
     self.assertFalse(index.is_open())
     table.close()
Beispiel #28
0
 def test_open_api(self):
     """
     Exercises opening and closing of tables and indexes, both with
     explicit close() calls and through the context manager protocol.
     """
     # Create a small table so that there is something to open.
     table = wt.Table(self._homedir)
     table.add_id_column()
     table.add_uint_column("u1")
     table.open("w")
     self.assertTrue(table.is_open())
     table.close()
     self.assertFalse(table.is_open())
     # open_table hands back a table that is already open
     table = wt.open_table(self._homedir)
     self.assertTrue(table.is_open())
     table.close()
     self.assertFalse(table.is_open())
     # the same again, this time leaving the closing to the with block
     with wt.open_table(self._homedir) as table:
         self.assertTrue(table.is_open())
     self.assertFalse(table.is_open())
     # Build an index so that the index side of the API can be checked.
     table = wt.open_table(self._homedir)
     index_name = "test"
     index = wt.Index(table, index_name)
     index.add_key_column(table.get_column(1))
     index.open("w")
     index.build()
     index.close()
     # Now that it exists, the index can be opened through the table.
     index = table.open_index(index_name)
     self.assertTrue(index.is_open())
     index.close()
     self.assertFalse(index.is_open())
     with table.open_index(index_name) as index:
         self.assertTrue(index.is_open())
     self.assertFalse(index.is_open())
     table.close()
def get_variants_assoc_to_gene_set_from_previous_results(inp_folder, genes,
  field_name, negative_query, previous_results):
  """
  Open the field_name wormtable (assumed to be named 'inp_folder/field_name.wt')
  and collect the row IDs whose field_name value mentions at least one gene,
  looking only at the row IDs listed in previous_results.

  The field value is split on ',' and a row matches when any gene from genes
  occurs as a substring of any of the resulting items. Rows whose value is
  missing (None) never match.

  previous_results: path of a tab-separated file with a 1-line header and the
    row ID in the left-most field. Blank lines are ignored.
  negative_query selects what is returned. It accepts the strings 'True' and
  'False' as well as the booleans True and False:
    - True / 'True':   the checked row IDs that did NOT match;
    - False / 'False': the checked row IDs that matched.
  Any other value returns None, as it always did.
  """

  # extract the row IDs to check from previous_results
  ids_to_check = set()
  with open(previous_results) as f:
    next(f, None)  # skip the header line
    for line in f:
      if line.strip():
        ids_to_check.add(int(line.split('\t')[0]))
  all_ids = set()
  pos_ids = set()
  # open wormtable for the field of interest
  table = wt.open_table(inp_folder + '/' + field_name + '.wt',
          db_cache_size='4G')
  try:
    index = table.open_index('row_id')
    try:
      # the cursor yields (row_id, field value) pairs
      for row_id, field in index.cursor(['row_id', field_name]):
        if row_id not in ids_to_check:
          continue
        all_ids.add(row_id)
        if field is None:
          continue
        if any(gene in value for value in field.split(',') for gene in genes):
          pos_ids.add(row_id)
    finally:
      # the index is closed before the table it belongs to
      index.close()
  finally:
    table.close()
  if negative_query in (True, 'True'):
    return all_ids - pos_ids
  elif negative_query in (False, 'False'):
    return pos_ids
def get_variants_assoc_to_gene_set_from_previous_results(
        inp_folder, genes, field_name, negative_query, previous_results):
    """
    Open the field_name wormtable (assumed to be named
    'inp_folder/field_name.wt') and collect the row IDs whose field_name
    value mentions at least one gene, looking only at the row IDs listed in
    previous_results.

    The field value is split on ',' and a row matches when any gene from
    genes occurs as a substring of any of the resulting items. Rows whose
    value is missing (None) never match.

    previous_results: path of a tab-separated file with a 1-line header and
      the row ID in the left-most field. Blank lines are ignored.
    negative_query selects what is returned. It accepts the strings 'True'
    and 'False' as well as the booleans True and False:
      - True / 'True':   the checked row IDs that did NOT match;
      - False / 'False': the checked row IDs that matched.
    Any other value returns None, as it always did.
    """

    # extract the row IDs to check from previous_results
    ids_to_check = set()
    with open(previous_results) as f:
        next(f, None)  # skip the header line
        for line in f:
            if line.strip():
                ids_to_check.add(int(line.split('\t')[0]))
    all_ids = set()
    pos_ids = set()
    # open wormtable for the field of interest
    table = wt.open_table(inp_folder + '/' + field_name + '.wt',
                          db_cache_size='4G')
    try:
        index = table.open_index('row_id')
        try:
            # the cursor yields (row_id, field value) pairs
            for row_id, field in index.cursor(['row_id', field_name]):
                if row_id not in ids_to_check:
                    continue
                all_ids.add(row_id)
                if field is None:
                    continue
                if any(gene in value
                       for value in field.split(',')
                       for gene in genes):
                    pos_ids.add(row_id)
        finally:
            # the index is closed before the table it belongs to
            index.close()
    finally:
        table.close()
    if negative_query in (True, 'True'):
        return all_ids - pos_ids
    elif negative_query in (False, 'False'):
        return pos_ids
Beispiel #31
0
 def setUp(self):
     """
     Runs the command on the VCF returned by get_vcf(), opens the resulting
     table and counts, straight from the VCF text, the number of
     "##INFO" header lines and the number of non-header lines.
     """
     super(VcfBuildTest, self).setUp()
     vcf = self.get_vcf()
     self.run_command([vcf, self._homedir, "-qf"])
     self._table = wt.open_table(self._homedir)
     # get some simple information about the VCF
     self._num_rows = 0
     self._info_cols = 0
     # the with block closes the file, which used to be left open
     with open(vcf, "r") as f:
         for line in f:
             if line.startswith("##INFO"):
                 self._info_cols += 1
             elif not line.startswith("#"):
                 # every line that is not a header line is a data row
                 self._num_rows += 1
Beispiel #32
0
 def setUp(self):
     """
     Runs the command on the VCF returned by get_vcf(), opens the resulting
     table and counts, straight from the VCF text, the number of
     "##INFO" header lines and the number of non-header lines.
     """
     super(VcfBuildTest, self).setUp()
     vcf = self.get_vcf()
     self.run_command([vcf, self._homedir, "-qf"])
     self._table = wt.open_table(self._homedir)
     # get some simple information about the VCF
     self._num_rows = 0
     self._info_cols = 0
     # the with block closes the file, which used to be left open
     with open(vcf, "r") as f:
         for line in f:
             if line.startswith("##INFO"):
                 self._info_cols += 1
             elif not line.startswith("#"):
                 # every line that is not a header line is a data row
                 self._num_rows += 1
Beispiel #33
0
def count_Ts_Tv(homedir):
    """
    Count the number of transitions and transversions using an index on
    REF+ALT.

    Opens the wormtable in homedir together with its "REF+ALT" index and
    looks up, in the index counter, every ordered pair of distinct bases
    from A, C, G, T. A pair whose two bases belong to the same chemical
    class (both purines or both pyrimidines) is added to the transition
    count, any other pair to the transversion count.

    Returns the tuple (Ts, Tv).
    """
    bases = {b'A': 'purine', b'G': 'purine',
             b'C': 'pyrimidine', b'T': 'pyrimidine'}
    Ts, Tv = 0, 0
    # context managers close the index first and then the table, also
    # when the counter lookup raises
    with wt.open_table(homedir) as t, t.open_index("REF+ALT") as i:
        c = i.counter()
        for s in permutations([b'A', b'C', b'G', b'T'], 2):
            if bases[s[0]] == bases[s[1]]:
                Ts += c[s]
            else:
                Tv += c[s]
    return Ts, Tv
Beispiel #34
0
 def setUp(self):
     """
     Build a wormtable from the test GTF and parse the same file by hand.

     After the parent setUp, the GTF returned by self.get_gtf() is converted
     into self._homedir via self.run_command and the table is opened as
     self._table. The file is then parsed line by line into self._rows, a
     list with one dict per line keyed by the names in self._columns;
     str values are stored encoded (bytes), "." in the score and frame
     fields becomes None. self._num_rows holds the number of parsed lines.
     """
     super(GtfBuildTest, self).setUp()
     gtf = self.get_gtf()
     self.run_command([gtf, self._homedir, "-qf"])
     self._table = wt.open_table(self._homedir)
     self._columns = [
         SEQNAME, SOURCE, FEATURE, START, END, SCORE, STRAND, FRAME,
         GENE_ID, TRANSCRIPT_ID
     ]
     # parse the file
     self._num_rows = 0
     self._rows = []
     # 'with' guarantees the file handle is closed once parsing finishes
     # or fails
     with open(gtf, "r") as f:
         for l in f:
             row = {}
             tokens = l.split("\t")
             row[SEQNAME] = tokens[0]
             row[SOURCE] = tokens[1]
             row[FEATURE] = tokens[2]
             row[START] = int(tokens[3])
             row[END] = int(tokens[4])
             tok = tokens[5]
             row[SCORE] = float(tok) if tok != "." else None
             row[STRAND] = tokens[6]
             tok = tokens[7]
             row[FRAME] = int(tok) if tok != "." else None
             # the ninth field is a ';'-separated list of 'key "value"'
             # pairs
             d = {}
             for s in tokens[8].split(";"):
                 spl = s.split()
                 if len(spl) > 0:
                     d[spl[0]] = spl[1].strip("\"")
             row[GENE_ID] = d[GENE_ID]
             row[TRANSCRIPT_ID] = d[TRANSCRIPT_ID]
             # if the type is str, encode it.
             r = {}
             for k, v in row.items():
                 r[k] = v.encode() if isinstance(v, str) else v
             self._rows.append(r)
             self._num_rows += 1
Beispiel #35
0
 def setUp(self):
     """
     Build a wormtable from the test GTF and parse the same file by hand.

     After the parent setUp, the GTF returned by self.get_gtf() is converted
     into self._homedir via self.run_command and the table is opened as
     self._table. The file is then parsed line by line into self._rows, a
     list with one dict per line keyed by the names in self._columns;
     str values are stored encoded (bytes), "." in the score and frame
     fields becomes None. self._num_rows holds the number of parsed lines.
     """
     super(GtfBuildTest, self).setUp()
     gtf = self.get_gtf()
     self.run_command([gtf, self._homedir, "-qf"])
     self._table = wt.open_table(self._homedir)
     self._columns = [SEQNAME, SOURCE, FEATURE, START, END, SCORE, STRAND,
             FRAME, GENE_ID, TRANSCRIPT_ID]
     # parse the file
     self._num_rows = 0
     self._rows = []
     # 'with' guarantees the file handle is closed once parsing finishes
     # or fails
     with open(gtf, "r") as f:
         for l in f:
             row = {}
             tokens = l.split("\t")
             row[SEQNAME] = tokens[0]
             row[SOURCE] = tokens[1]
             row[FEATURE] = tokens[2]
             row[START] = int(tokens[3])
             row[END] = int(tokens[4])
             tok = tokens[5]
             row[SCORE] = float(tok) if tok != "." else None
             row[STRAND] = tokens[6]
             tok = tokens[7]
             row[FRAME] = int(tok) if tok != "." else None
             # the ninth field is a ';'-separated list of 'key "value"'
             # pairs
             d = {}
             for s in tokens[8].split(";"):
                 spl = s.split()
                 if len(spl) > 0:
                     d[spl[0]] = spl[1].strip("\"")
             row[GENE_ID] = d[GENE_ID]
             row[TRANSCRIPT_ID] = d[TRANSCRIPT_ID]
             # if the type is str, encode it.
             r = {}
             for k, v in row.items():
                 r[k] = v.encode() if isinstance(v, str) else v
             self._rows.append(r)
             self._num_rows += 1
def get_variants_in_given_regions(inp_folder, chrom, start_pos, end_pos):
  """
  Return the set of row IDs of the variants lying in a region of interest.

  Opens the CHROM+POS wormtable (assumed to be named
  'inp_folder/CHROM+POS.wt') and reads its 'CHROM+POS' index with a cursor
  bounded by start=(chrom, start_pos) and stop=(chrom, end_pos). The row_id
  of every row the cursor yields is collected; how the bounds are treated
  (inclusive/exclusive) is decided by the wormtable cursor.
  """
  # NOTE: it assumes the wormtable has three columns: 'row_id', 'CHROM', 'POS'
  row_id_idx = 0
  cols = ['row_id', 'CHROM', 'POS']
  # the context managers close the index and then the table, also when the
  # cursor raises part-way through
  with wt.open_table(inp_folder + '/CHROM+POS.wt',
                     db_cache_size='4G') as table:
    with table.open_index('CHROM+POS') as index:
      region = index.cursor(cols, start=(chrom, start_pos),
                            stop=(chrom, end_pos))
      return set(row[row_id_idx] for row in region)
Beispiel #37
0
def get_variants_of_given_type(inp_folder, var_type):
    """
    Return the set of row IDs of all variants of the requested type.

    Opens the REF+ALT wormtable (assumed to be named
    'inp_folder/REF+ALT.wt') and scans every row. A row is reported as soon
    as one of its comma-separated ALT alleles satisfies the rule for
    var_type:

      'SNPs'   -- REF and the ALT allele are both one character long
      'InDels' -- REF and the ALT allele differ in length
      'MNPs'   -- REF is longer than one character and the ALT allele has
                  the same length as REF

    Any other var_type writes a message to stderr and terminates through
    sys.exit(); this is checked before the table is opened so no table is
    left open on that path.
    """
    # one (ref, alt) predicate per supported variant type
    predicates = {
        'SNPs': lambda ref, alt: len(ref) == 1 and len(alt) == 1,
        'InDels': lambda ref, alt: len(ref) != len(alt),
        'MNPs': lambda ref, alt: len(ref) > 1 and len(ref) == len(alt),
    }
    if var_type not in predicates:
        sys.stderr.write("\nVariant type not properly defined.\n")
        sys.exit()
    is_match = predicates[var_type]

    ids = set()
    # NOTE: it assumes the wormtable has three columns: 'row_id', 'REF', 'ALT'
    with wt.open_table(inp_folder + '/REF+ALT.wt',
                       db_cache_size='4G') as table:
        for row_id, ref, alts in table.cursor(['row_id', 'REF', 'ALT']):
            if any(is_match(ref, alt) for alt in alts.split(',')):
                ids.add(row_id)
    return ids
def get_variants_of_given_type(inp_folder, var_type):
  """
  Return the set of row IDs of all variants of the requested type.

  Opens the REF+ALT wormtable (assumed to be named 'inp_folder/REF+ALT.wt')
  and scans every row. A row is reported as soon as one of its
  comma-separated ALT alleles satisfies the rule for var_type:

    'SNPs'   -- REF and the ALT allele are both one character long
    'InDels' -- REF and the ALT allele differ in length
    'MNPs'   -- REF is longer than one character and the ALT allele has
                the same length as REF

  Any other var_type writes a message to stderr and terminates through
  sys.exit(); this is checked before the table is opened so no table is
  left open on that path.
  """
  # one (ref, alt) predicate per supported variant type
  predicates = {
    'SNPs': lambda ref, alt: len(ref) == 1 and len(alt) == 1,
    'InDels': lambda ref, alt: len(ref) != len(alt),
    'MNPs': lambda ref, alt: len(ref) > 1 and len(ref) == len(alt),
  }
  if var_type not in predicates:
    sys.stderr.write("\nVariant type not properly defined.\n")
    sys.exit()
  is_match = predicates[var_type]

  ids = set()
  # NOTE: it assumes the wormtable has three columns: 'row_id', 'REF', 'ALT'
  with wt.open_table(inp_folder + '/REF+ALT.wt',
                     db_cache_size='4G') as table:
    for row_id, ref, alts in table.cursor(['row_id', 'REF', 'ALT']):
      if any(is_match(ref, alt) for alt in alts.split(',')):
        ids.add(row_id)
  return ids
Beispiel #39
0
def get_variants_in_given_regions_from_previous_results(
        inp_folder, chrom, start_pos, end_pos, previous_results):
    """
    Return the row IDs in a region of interest, restricted to earlier hits.

    Opens the CHROM+POS wormtable (assumed to be named
    'inp_folder/CHROM+POS.wt') and reads its 'CHROM+POS' index with a cursor
    bounded by start=(chrom, start_pos) and stop=(chrom, end_pos). Only
    row IDs that also occur in previous_results are returned, which narrows
    the result of an earlier filtering step.

    previous_results is a file path. NOTE: the file is assumed to have a
    1-line header, to be tab-separated and to carry row_id as the left-most
    field. Blank lines are skipped.
    """
    ids_to_check = set()
    with open(previous_results) as f:
        # discard the header line (a no-op on an empty file)
        next(f, None)
        for line in f:
            if line.strip():
                ids_to_check.add(int(line.split('\t')[0]))

    # NOTE: it assumes the wormtable has three columns: 'row_id', 'CHROM', 'POS'
    row_id_idx = 0
    cols = ['row_id', 'CHROM', 'POS']
    # the context managers close the index and then the table, also when
    # the cursor raises part-way through
    with wt.open_table(inp_folder + '/CHROM+POS.wt',
                       db_cache_size='4G') as table:
        with table.open_index('CHROM+POS') as index:
            region = index.cursor(cols,
                                  start=(chrom, start_pos),
                                  stop=(chrom, end_pos))
            return set(row[row_id_idx] for row in region
                       if row[row_id_idx] in ids_to_check)
Beispiel #40
0
def count_Ts_Tv(homedir):
    """
    Count the number of transitions and transversions using an index on
    REF+ALT.

    Opens the wormtable in homedir together with its "REF+ALT" index and
    looks up, in the index counter, every ordered pair of distinct bases
    from A, C, G, T. A pair whose two bases belong to the same chemical
    class (both purines or both pyrimidines) is added to the transition
    count, any other pair to the transversion count.

    Returns the tuple (Ts, Tv).
    """
    bases = {
        b'A': 'purine',
        b'G': 'purine',
        b'C': 'pyrimidine',
        b'T': 'pyrimidine'
    }
    Ts, Tv = 0, 0
    # context managers close the index first and then the table, also
    # when the counter lookup raises
    with wt.open_table(homedir) as t, t.open_index("REF+ALT") as i:
        c = i.counter()
        for s in permutations([b'A', b'C', b'G', b'T'], 2):
            if bases[s[0]] == bases[s[1]]:
                Ts += c[s]
            else:
                Tv += c[s]
    return Ts, Tv
Beispiel #41
0
def get_variants_in_given_regions(inp_folder, chrom, start_pos, end_pos):
    """
    Return the set of row IDs of the variants lying in a region of interest.

    Opens the CHROM+POS wormtable (assumed to be named
    'inp_folder/CHROM+POS.wt') and reads its 'CHROM+POS' index with a cursor
    bounded by start=(chrom, start_pos) and stop=(chrom, end_pos). The
    row_id of every row the cursor yields is collected; how the bounds are
    treated (inclusive/exclusive) is decided by the wormtable cursor.
    """
    # NOTE: it assumes the wormtable has three columns: 'row_id', 'CHROM', 'POS'
    row_id_idx = 0
    cols = ['row_id', 'CHROM', 'POS']
    # the context managers close the index and then the table, also when
    # the cursor raises part-way through
    with wt.open_table(inp_folder + '/CHROM+POS.wt',
                       db_cache_size='4G') as table:
        with table.open_index('CHROM+POS') as index:
            region = index.cursor(cols,
                                  start=(chrom, start_pos),
                                  stop=(chrom, end_pos))
            return set(row[row_id_idx] for row in region)
Beispiel #42
0
 def __init__(self, variantSetId, wtDir):
     """
     Create a WormtableVariantSet identified by variantSetId and backed by
     the wormtable stored in the directory wtDir.
     """
     self._variantSetId = variantSetId
     self._wtDir = wtDir
     self._table = wt.open_table(wtDir)
     self._chromPosIndex = self._table.open_index("CHROM+POS")
     self._chromIdIndex = self._table.open_index("CHROM+ID")
     self._sampleCols = {}
     self._sampleNames = []
     self._infoCols = []
     self._firstSamplePosition = -1
     # getctime() reports seconds; the timestamps are kept as milliseconds
     # since the epoch
     millis = int(os.path.getctime(wtDir) * 1000)
     self._creationTime = millis
     self._updatedTime = millis
     # Lookup tables for the INFO and per-sample columns make them easy to
     # find during conversion. _infoCols is a list of (field, column)
     # pairs; _sampleCols maps each sample name to its list of
     # (field, column) pairs and _sampleNames keeps the samples in the
     # order they are first encountered.
     for column in self._table.columns()[self.FILTER_COL + 1:]:
         name = column.get_name()
         if name.startswith("INFO"):
             field = name.split(".")[1]
             self._infoCols.append((field, column))
             continue
         # Anything that is not an INFO column is treated as a sample
         # specific column; remember where the first one sits.
         if self._firstSamplePosition == -1:
             self._firstSamplePosition = column.get_position()
         sampleName, field = name.split(".")
         if sampleName not in self._sampleCols:
             self._sampleCols[sampleName] = []
             self._sampleNames.append(sampleName)
         self._sampleCols[sampleName].append((field, column))
def get_variants_in_given_regions_from_previous_results(inp_folder, chrom,
    start_pos, end_pos, previous_results):
  """
  Return the row IDs in a region of interest, restricted to earlier hits.

  Opens the CHROM+POS wormtable (assumed to be named
  'inp_folder/CHROM+POS.wt') and reads its 'CHROM+POS' index with a cursor
  bounded by start=(chrom, start_pos) and stop=(chrom, end_pos). Only row
  IDs that also occur in previous_results are returned, which narrows the
  result of an earlier filtering step.

  previous_results is a file path. NOTE: the file is assumed to have a
  1-line header, to be tab-separated and to carry row_id as the left-most
  field. Blank lines are skipped.
  """
  ids_to_check = set()
  with open(previous_results) as f:
    # discard the header line (a no-op on an empty file)
    next(f, None)
    for line in f:
      if line.strip():
        ids_to_check.add(int(line.split('\t')[0]))

  # NOTE: it assumes the wormtable has three columns: 'row_id', 'CHROM', 'POS'
  row_id_idx = 0
  cols = ['row_id', 'CHROM', 'POS']
  # the context managers close the index and then the table, also when the
  # cursor raises part-way through
  with wt.open_table(inp_folder + '/CHROM+POS.wt',
                     db_cache_size='4G') as table:
    with table.open_index('CHROM+POS') as index:
      region = index.cursor(cols, start=(chrom, start_pos),
                            stop=(chrom, end_pos))
      return set(row[row_id_idx] for row in region
                 if row[row_id_idx] in ids_to_check)
Beispiel #44
0
 def __init__(self, variantSetId, wtDir):
     """
     Create a WormtableDataset identified by variantSetId and backed by
     the wormtable stored in the directory wtDir.
     """
     self._variantSetId = variantSetId
     self._wtDir = wtDir
     self._table = wt.open_table(wtDir)
     self._chromPosIndex = self._table.open_index("CHROM+POS")
     self._chromIdIndex = self._table.open_index("CHROM+ID")
     self._sampleCols = {}
     self._sampleNames = []
     self._infoCols = []
     self._firstSamplePosition = -1
     # getctime() reports seconds; the timestamps are kept as milliseconds
     # since the epoch
     millis = int(os.path.getctime(wtDir) * 1000)
     self._creationTime = millis
     self._updatedTime = millis
     # Lookup tables for the INFO and per-sample columns make them easy to
     # find during conversion. _infoCols is a list of (field, column)
     # pairs; _sampleCols maps each sample name to its list of
     # (field, column) pairs and _sampleNames keeps the samples in the
     # order they are first encountered.
     for column in self._table.columns()[self.FILTER_COL + 1:]:
         name = column.get_name()
         if name.startswith("INFO"):
             field = name.split(".")[1]
             self._infoCols.append((field, column))
             continue
         # Anything that is not an INFO column is treated as a sample
         # specific column; remember where the first one sits.
         if self._firstSamplePosition == -1:
             self._firstSamplePosition = column.get_position()
         sampleName, field = name.split(".")
         if sampleName not in self._sampleCols:
             self._sampleCols[sampleName] = []
             self._sampleNames.append(sampleName)
         self._sampleCols[sampleName].append((field, column))
Beispiel #45
0
def main():
    """
    Command line entry point: parse the arguments, open the table in the
    given home directory together with its "CHROM+POS" index, and print
    every row produced by snp_filter as one tab-separated line.
    """
    arg_parser = argparse.ArgumentParser(description=__doc__)

    arg_parser.add_argument(
        'cols', default="CHROM,POS",
        help='comma separated column names to print')
    arg_parser.add_argument(
        '-i', default='i', choices=['i','e','f'],
        help='indel mode: i=include, e=exclude, f=find [default=i]')
    arg_parser.add_argument(
        '-f',
        help='specify semicolon separated filters as COLUMN(>=|<=|>|<|==|!=)VALUE,\
              e.g. "QUAL>20;SAMPLE.GT==0/0"')
    arg_parser.add_argument(
        '-r',
        help='region, e.g. 1:300-500 (start and end inclusive)')
    arg_parser.add_argument(
        'homedir',
        help='home directory of database')

    # snp_filter receives the parsed options as a plain dict
    options = vars(arg_parser.parse_args())

    with wt.open_table(options['homedir']) as table:
        with table.open_index("CHROM+POS") as index:
            for record in snp_filter(table, index, options):
                print('\t'.join(str(field) for field in record))
Beispiel #46
0
def hq_snps(homedir, minq, cols):
    """
    Generate rows of the table in homedir, read through its "QUAL[1]" index
    with the cursor minimum set to minq. Only the columns named in cols are
    read. The table and index stay open while the generator is being
    consumed.
    """
    with wt.open_table(homedir) as table:
        with table.open_index("QUAL[1]") as qual_index:
            rows = table.cursor(cols, qual_index)
            rows.set_min(minq)
            for record in rows:
                yield record
def filter_variants_from_previous_results(inp_folder, field_name, operator,
  cutoff, keep_novalue, previous_results):
  """
  Open wormtable within inp_folder corresponding to field_name (assumed to be
  named 'inp_folder/field_name.wt') and filter or discard variants according to
  the specified cutoff. The row_id value of each filtered variant is stored in
  the set ids, which is returned. Use ids from previous_results as starting
  point to further filter the data and to make it faster.

  Arguments, as used by the code below:
    inp_folder       -- folder containing the per-field wormtables
    field_name       -- name of the field; also the name of the wormtable
    operator         -- one of 'greater_than', 'less_than', 'equal_to',
                        'contains_keyword'; any other value selects no branch
    cutoff           -- threshold; converted with float() for numeric
                        comparisons, compared as-is against string values for
                        'equal_to', and split on ',' into keywords for
                        'contains_keyword'
    keep_novalue     -- the string 'True' keeps rows whose value is missing
                        (None, '', 'nan' or NaN); anything else drops them
    previous_results -- path of a tab-separated file with a 1-line header and
                        row_id as the left-most field

  For multi-valued fields (comma-separated strings or tuples) a row is kept
  as soon as one of its values satisfies the test.
  """

  # extract row IDs to check from previous_results (which is a file path) and
  # store them in a set; NOTE: it assumes previous_results has a 1-line header,
  # is tab-separated and row_id is the left-most field!
  ids_to_check = set()
  f = open(previous_results)
  header = True
  for line in f:
    if header:
      header = False
    else:
      ids_to_check.add(int(line.split('\t')[0]))
  f.close()
  # open wormtable for the field of interest
  table = wt.open_table(inp_folder + '/' + field_name + '.wt',
                        db_cache_size='4G')
  index = table.open_index('row_id')
  # retrieve rows passing the cutoff for field_name and store their row_id
  ids = set()
  # NOTE: it assumes the wormtable has only two columns: 'row_id' and field_name
  row_id_idx = 0
  field_name_idx = 1
  for row in index.cursor(['row_id', field_name]):
    # only analyse row if row_id is among the ones in ids_to_check
    if row[row_id_idx] in ids_to_check:
      # the type of the field value for the current row is 'NoneType', empty,
      # or 'nan'
      if row[field_name_idx] is None or row[field_name_idx] == '':
        if keep_novalue == 'True':
          ids.add(row[row_id_idx])
        else:
          pass
      # the type of the field value for the current row is 'str'
      elif isinstance(row[field_name_idx], str):
        if operator == 'greater_than' or operator == 'less_than':
          # special case: NUM/NUM (which is recognised as string by wormtable)
          # solution: we check that the ratio NUM/NUM is >,<,= cutoff
          # NOTE(review): a zero denominator raises ZeroDivisionError here
          for value in row[field_name_idx].split(','):
            if value == '' or value == 'nan':
              if keep_novalue == 'True':
                ids.add(row[row_id_idx])
                break
            elif value.find('/') != -1:
              if operator == 'greater_than':
                if float(value.split('/')[0])/float(value.split('/')[1]) > float(cutoff):
                  ids.add(row[row_id_idx])
                  break
              elif operator == 'less_than':
                if float(value.split('/')[0])/float(value.split('/')[1]) < float(cutoff):
                  ids.add(row[row_id_idx])
                  break
            else:
              # any other string value cannot be ordered against the cutoff:
              # report the problem and terminate the program (the table and
              # index are not closed on this path)
              sys.stderr.write('\nError: ' + operator + ' incompatible with' +
                               ' field type (string).\n')
              sys.exit()
        elif operator == 'equal_to':
          for value in row[field_name_idx].split(','):
            if value == '' or value == 'nan':
              if keep_novalue == 'True':
                ids.add(row[row_id_idx])
                break
            # special case: NUM/NUM (which is recognised as string by wormtable)
            # solution: we check that the ratio NUM/NUM is >,<,= cutoff
            elif value.find('/') != -1:
              if float(value.split('/')[0])/float(value.split('/')[1]) == float(cutoff):
                ids.add(row[row_id_idx])
                break
            # plain string: exact comparison against the raw cutoff
            elif value == cutoff:
              ids.add(row[row_id_idx])
              break
        elif operator == 'contains_keyword':
          for value in row[field_name_idx].split(','):
            if value == '' or value == 'nan':
              if keep_novalue == 'True':
                ids.add(row[row_id_idx])
                break
            # substring search of every keyword in the current value;
            # NOTE: the 'break' below only leaves the keyword loop, so the
            # remaining values are still scanned (harmless, ids is a set)
            for keyword in set(cutoff.split(',')):
              if value.find(keyword) != -1:
                ids.add(row[row_id_idx])
                break
      # the type of the field value for the current row is 'tuple'
      elif isinstance(row[field_name_idx], tuple):
        if operator == 'greater_than':
          for value in row[field_name_idx]:
            if math.isnan(value):
              if keep_novalue == 'True':
                ids.add(row[row_id_idx])
                break
            elif value > float(cutoff):
              ids.add(row[row_id_idx])
              break
        elif operator == 'less_than':
          for value in row[field_name_idx]:
            if math.isnan(value):
              if keep_novalue == 'True':
                ids.add(row[row_id_idx])
                break
            elif value < float(cutoff):
              ids.add(row[row_id_idx])
              break
        elif operator == 'equal_to':
          for value in row[field_name_idx]:
            if math.isnan(value):
              if keep_novalue == 'True':
                ids.add(row[row_id_idx])
                break
            elif value == float(cutoff):
              ids.add(row[row_id_idx])
              break
        elif operator == 'contains_keyword':
          # NOTE(review): math.isnan() requires a number while .find() below
          # requires a string, so this branch presumably cannot complete for
          # either kind of tuple element -- confirm whether it is reachable
          for value in row[field_name_idx]:
            if math.isnan(value):
              if keep_novalue == 'True':
                ids.add(row[row_id_idx])
                break
            for keyword in set(cutoff.split(',')):
              if value.find(keyword) != -1:
                ids.add(row[row_id_idx])
                break
      # the type of the field value for the current row is 'int' or 'float'
      # this includes cases of string numbers (e.g. '1234')
      # NOTE(review): str values are already consumed by the isinstance(str)
      # branch above, so presumably only real numbers arrive here; is_number
      # is a helper not shown in this block -- confirm
      elif is_number(row[field_name_idx]):
        if math.isnan(row[field_name_idx]):
          if keep_novalue == 'True':
            ids.add(row[row_id_idx])
        elif operator == 'greater_than':
          if row[field_name_idx] > float(cutoff):
            ids.add(row[row_id_idx])
        elif operator == 'less_than':
          if row[field_name_idx] < float(cutoff):
            ids.add(row[row_id_idx])
        elif operator == 'equal_to':
          if row[field_name_idx] == float(cutoff):
            ids.add(row[row_id_idx])
        elif operator == 'contains_keyword':
          # NOTE(review): .find() is a string method; on an int/float value
          # this presumably raises AttributeError -- confirm
          for keyword in set(cutoff.split(',')):
            if row[field_name_idx].find(keyword) != -1:
              ids.add(row[row_id_idx])
              break
  # close table and index
  # NOTE(review): the table is closed before the index opened from it;
  # confirm this order is accepted by wormtable
  table.close()
  index.close()
  return ids
def filter_variants(inp_folder, field_name, operator, cutoff, keep_novalue):
  """
  Open wormtable within inp_folder corresponding to field_name (assumed to be
  named 'inp_folder/field_name.wt') and filter or discard variants according to
  the specified cutoff. The row_id value of each filtered variant is stored in
  the set ids, which is returned.

  Arguments, as used by the code below:
    inp_folder   -- folder containing the per-field wormtables
    field_name   -- name of the field; also the name of the wormtable
    operator     -- one of 'greater_than', 'less_than', 'equal_to',
                    'contains_keyword'; any other value selects no branch
    cutoff       -- threshold; converted with float() for numeric
                    comparisons, compared as-is against string values for
                    'equal_to', and split on ',' into keywords for
                    'contains_keyword'
    keep_novalue -- the string 'True' keeps rows whose value is missing
                    (None, '', 'nan' or NaN); anything else drops them

  For multi-valued fields (comma-separated strings or tuples) a row is kept
  as soon as one of its values satisfies the test.
  """

  # open wormtable for the field of interest
  table = wt.open_table(inp_folder + '/' + field_name + '.wt',
                        db_cache_size='4G')
  # retrieve rows passing the cutoff for field_name and store their row_id
  ids = set()
  # NOTE: it assumes the wormtable has only two columns: 'row_id' and field_name
  row_id_idx = 0
  field_name_idx = 1
  # NOTE: row is a tuple of row_id and field_name
  for row in table.cursor(['row_id', field_name]):
    # the type of the field value for the current row is 'NoneType', empty,
    # or 'nan'
    if row[field_name_idx] is None or row[field_name_idx] == '':
      if keep_novalue == 'True':
        ids.add(row[row_id_idx])
      else:
        pass
    # the type of the field value for the current row is 'str'
    elif isinstance(row[field_name_idx], str):
      if operator == 'greater_than' or operator == 'less_than':
        # special case: NUM/NUM (which is recognised as string by wormtable)
        # solution: we check that the ratio NUM/NUM is >,<,= cutoff
        # NOTE(review): a zero denominator raises ZeroDivisionError here
        for value in row[field_name_idx].split(','):
          if value == '' or value == 'nan':
            if keep_novalue == 'True':
              ids.add(row[row_id_idx])
              break
          elif value.find('/') != -1:
            if operator == 'greater_than':
              if float(value.split('/')[0])/float(value.split('/')[1]) > float(cutoff):
                ids.add(row[row_id_idx])
                break
            elif operator == 'less_than':
              if float(value.split('/')[0])/float(value.split('/')[1]) < float(cutoff):
                ids.add(row[row_id_idx])
                break
          else:
            # any other string value cannot be ordered against the cutoff:
            # report the problem and terminate the program (the table is not
            # closed on this path)
            sys.stderr.write('\nError: ' + operator + ' incompatible with' +
                             ' field type (string).\n')
            sys.exit()
      elif operator == 'equal_to':
        for value in row[field_name_idx].split(','):
          if value == '' or value == 'nan':
            if keep_novalue == 'True':
              ids.add(row[row_id_idx])
              break
          # special case: NUM/NUM (which is recognised as string by wormtable)
          # solution: we check that the ratio NUM/NUM is >,<,= cutoff
          elif value.find('/') != -1:
            if float(value.split('/')[0])/float(value.split('/')[1]) == float(cutoff):
              ids.add(row[row_id_idx])
              break
          # plain string: exact comparison against the raw cutoff
          elif value == cutoff:
            ids.add(row[row_id_idx])
            break
      elif operator == 'contains_keyword':
        for value in row[field_name_idx].split(','):
          if value == '' or value == 'nan':
            if keep_novalue == 'True':
              ids.add(row[row_id_idx])
              break
          # substring search of every keyword in the current value;
          # NOTE: the 'break' below only leaves the keyword loop, so the
          # remaining values are still scanned (harmless, ids is a set)
          for keyword in set(cutoff.split(',')):
            if value.find(keyword) != -1:
              ids.add(row[row_id_idx])
              break
    # the type of the field value for the current row is 'tuple'
    elif isinstance(row[field_name_idx], tuple):
      if operator == 'greater_than':
        for value in row[field_name_idx]:
          if math.isnan(value):
            if keep_novalue == 'True':
              ids.add(row[row_id_idx])
              break
          elif value > float(cutoff):
            ids.add(row[row_id_idx])
            break
      elif operator == 'less_than':
        for value in row[field_name_idx]:
          if math.isnan(value):
            if keep_novalue == 'True':
              ids.add(row[row_id_idx])
              break
          elif value < float(cutoff):
            ids.add(row[row_id_idx])
            break
      elif operator == 'equal_to':
        for value in row[field_name_idx]:
          if math.isnan(value):
            if keep_novalue == 'True':
              ids.add(row[row_id_idx])
              break
          elif value == float(cutoff):
            ids.add(row[row_id_idx])
            break
      elif operator == 'contains_keyword':
        # NOTE(review): math.isnan() requires a number while .find() below
        # requires a string, so this branch presumably cannot complete for
        # either kind of tuple element -- confirm whether it is reachable
        for value in row[field_name_idx]:
          if math.isnan(value):
            if keep_novalue == 'True':
              ids.add(row[row_id_idx])
              break
          for keyword in set(cutoff.split(',')):
            if value.find(keyword) != -1:
              ids.add(row[row_id_idx])
              break
    # the type of the field value for the current row is 'int' or 'float'
    # this includes cases of string numbers (e.g. '1234')
    # NOTE(review): str values are already consumed by the isinstance(str)
    # branch above, so presumably only real numbers arrive here; is_number
    # is a helper not shown in this block -- confirm
    elif is_number(row[field_name_idx]):
      if math.isnan(row[field_name_idx]):
        if keep_novalue == 'True':
          ids.add(row[row_id_idx])
      elif operator == 'greater_than':
        if row[field_name_idx] > float(cutoff):
          ids.add(row[row_id_idx])
      elif operator == 'less_than':
        if row[field_name_idx] < float(cutoff):
          ids.add(row[row_id_idx])
      elif operator == 'equal_to':
        if row[field_name_idx] == float(cutoff):
          ids.add(row[row_id_idx])
      elif operator == 'contains_keyword':
        # NOTE(review): .find() is a string method; on an int/float value
        # this presumably raises AttributeError -- confirm
        for keyword in set(cutoff.split(',')):
          if row[field_name_idx].find(keyword) != -1:
            ids.add(row[row_id_idx])
            break
  # close table
  table.close()
  return ids
def filter_variants(inp_folder, field_name, operator, cutoff, keep_novalue):
    """
    Return the row_id of every variant whose field_name value passes a filter.

    Opens the wormtable 'inp_folder/field_name.wt' (assumed to hold exactly
    two columns, 'row_id' and field_name), tests each row against the
    requested operator and collects the row_id of the rows that pass.

    Arguments:
      inp_folder   -- folder containing the per-field wormtables
      field_name   -- name of the field (and of the '<field_name>.wt' table)
      operator     -- one of 'is_present', 'is_absent', 'greater_than',
                      'less_than', 'equal_to', 'contains_keyword'; any other
                      value selects no rows with a value
      cutoff       -- threshold (numeric operators), value to match
                      ('equal_to') or comma-separated keywords
                      ('contains_keyword')
      keep_novalue -- the string 'True' to also keep variants without a
                      value; anything else discards them

    Returns the set of row_id values of the selected variants.

    Terminates the program (SystemExit, status 1) when 'greater_than' or
    'less_than' is applied to a string value that is not a NUM/NUM ratio, and
    re-raises AttributeError when 'contains_keyword' is applied to values
    that are not strings.

    NOTE: if operator is 'is_present' or 'is_absent', cutoff and keep_novalue
          will be ignored!
    """
    keep_missing = keep_novalue == 'True'
    ids = set()
    # open wormtable for the field of interest
    table = wt.open_table(inp_folder + '/' + field_name + '.wt',
                          db_cache_size='4G')
    try:
        # NOTE: each row is a tuple (row_id, field value)
        for row in table.cursor(['row_id', field_name]):
            if _fv_value_passes(row[1], operator, cutoff, keep_missing):
                ids.add(row[0])
    finally:
        # always release the table, even when a row aborts the scan
        table.close()
    return ids


def _fv_is_nan(value):
    """Return True only when value is a numeric NaN (never raises)."""
    try:
        return math.isnan(value)
    except TypeError:
        # strings, tuples, None, ... are not NaN
        return False


def _fv_incompatible(operator, kind):
    """Report on stderr that operator cannot be applied to a kind field."""
    sys.stderr.write('\nError: ' + operator + ' incompatible with' +
                     ' field type (' + kind + ').\n')


def _fv_compare(number, operator, cutoff):
    """Apply the numeric operator (>, < or ==) between number and cutoff."""
    threshold = float(cutoff)
    if operator == 'greater_than':
        return number > threshold
    if operator == 'less_than':
        return number < threshold
    return number == threshold


def _fv_ratio(text):
    """Evaluate a 'NUM/NUM' string as the quotient of its first two parts."""
    parts = text.split('/')
    return float(parts[0]) / float(parts[1])


def _fv_value_passes(value, operator, cutoff, keep_missing):
    """Dispatch one field value to the test matching its Python type."""
    # no value at all: None, empty string or a scalar NaN
    if value is None or value == '' or _fv_is_nan(value):
        if operator == 'is_absent':
            return True
        return keep_missing and operator != 'is_present'
    if isinstance(value, str):
        return _fv_string_passes(value, operator, cutoff, keep_missing)
    if isinstance(value, tuple):
        return _fv_tuple_passes(value, operator, cutoff, keep_missing)
    # 'int' or 'float'
    if is_number(value):
        return _fv_number_passes(value, operator, cutoff)
    return False


def _fv_string_passes(text, operator, cutoff, keep_missing):
    """Test a comma-separated string value against operator."""
    if operator in ('is_present', 'is_absent'):
        # the value is absent when it holds nothing but 'nan' entries
        only_missing = text.replace('nan', '').replace(',', '') == ''
        return only_missing == (operator == 'is_absent')

    if operator in ('greater_than', 'less_than'):
        for item in text.split(','):
            if item in ('', 'nan'):
                if keep_missing:
                    return True
            elif '/' in item:
                # special case: NUM/NUM (recognised as string by wormtable);
                # the ratio is what gets compared with the cutoff
                if _fv_compare(_fv_ratio(item), operator, cutoff):
                    return True
            else:
                _fv_incompatible(operator, 'string')
                # non-zero status: this is an error exit
                sys.exit(1)
        return False

    if operator == 'equal_to':
        for item in text.split(','):
            if item in ('', 'nan'):
                if keep_missing:
                    return True
            elif '/' in item:
                # special case: NUM/NUM, compared as a ratio
                if _fv_ratio(item) == float(cutoff):
                    return True
            elif item == cutoff:
                return True
        return False

    if operator == 'contains_keyword':
        keywords = set(cutoff.split(','))
        for item in text.split(','):
            if item in ('', 'nan'):
                if keep_missing:
                    return True
                # a missing-value placeholder must never match a keyword
                continue
            if any(keyword in item for keyword in keywords):
                return True
        return False

    return False


def _fv_tuple_passes(values, operator, cutoff, keep_missing):
    """Test a tuple value against operator; one passing element suffices."""
    if operator == 'is_present':
        return set(map(str, values)) != set(['nan'])
    if operator == 'is_absent':
        return set(map(str, values)) == set(['nan'])

    if operator in ('greater_than', 'less_than', 'equal_to'):
        for item in values:
            if _fv_is_nan(item):
                if keep_missing:
                    return True
            elif _fv_compare(item, operator, cutoff):
                return True
        return False

    if operator == 'contains_keyword':
        keywords = set(cutoff.split(','))
        for item in values:
            if _fv_is_nan(item):
                if keep_missing:
                    return True
                # nothing to search in a missing element
                continue
            for keyword in keywords:
                try:
                    if item.find(keyword) != -1:
                        return True
                except AttributeError:
                    _fv_incompatible(operator, 'tuple')
                    raise
        return False

    return False


def _fv_number_passes(number, operator, cutoff):
    """Test a scalar numeric value against operator."""
    if operator == 'is_present':
        return True
    if operator in ('greater_than', 'less_than', 'equal_to'):
        return _fv_compare(number, operator, cutoff)
    if operator == 'contains_keyword':
        for keyword in set(cutoff.split(',')):
            try:
                if number.find(keyword) != -1:
                    return True
            except AttributeError:
                _fv_incompatible(operator, 'number')
                raise
    return False
Beispiel #50
0
 def setUp(self):
     """Convert the example VCF into the home directory and open the table."""
     super(WtadminTest, self).setUp()
     # "-fq" is passed through to the converter unchanged
     conversion_args = [EXAMPLE_VCF, self._homedir, "-fq"]
     vcf2wt.main(conversion_args)
     opened_table = wt.open_table(self._homedir)
     self._table = opened_table
Beispiel #51
0
 def setUp(self):
     """Convert the example VCF into the home directory and open the table."""
     super(WtadminTest, self).setUp()
     # "-fq" is passed through to the converter unchanged
     conversion_args = [EXAMPLE_VCF, self._homedir, "-fq"]
     wt.vcf2wt_main(conversion_args)
     opened_table = wt.open_table(self._homedir)
     self._table = opened_table
Beispiel #52
0
 def __init__(self, homedir, chrs, cols, wsize=10000, db_cache_size="256M"):
     self.__table = wt.open_table(homedir, db_cache_size=db_cache_size)
     self.__index = self.__table.open_index("CHROM+POS", db_cache_size=db_cache_size)
     self.__wsize = wsize
     self.__chrs = chrs
     self.__cols = cols