def test_makes_df(self):
    """For every test VCF, v0_df produces one row per dataline."""
    from idiva.clf.df import v0_df
    from idiva.io import ReadVCF
    from idiva.utils import seek_then_rewind
    for path in PATHS.values():
        with path.open(mode='r') as stream:
            assert isinstance(stream, io.TextIOBase)
            # Count the raw datalines ...
            with seek_then_rewind(stream):
                n_datalines = len(list(ReadVCF(stream)))
            # ... then build the dataframe from a fresh read of the same stream.
            with seek_then_rewind(stream):
                frame = v0_df(ReadVCF(stream))
            self.assertEqual(n_datalines, len(frame))
def test_reads_all_lines(self):
    """The VCF iterator yields exactly one item per remaining raw line."""
    from idiva.utils import seek_then_rewind
    for path in PATHS.values():
        with open_maybe_gz(path, mode='r') as fd:
            vcf = ReadVCF(fd)
            # ReadVCF has consumed the header; count what is left in the raw stream.
            with seek_then_rewind(fd, seek=None):
                expected = len(fd.readlines())
            with vcf.rewind_when_done:
                actual = len(list(vcf))
            self.assertEqual(actual, expected)
def check_all(fd: typing.TextIO):
    """
    Run each "bag of assumptions" check against the VCF stream `fd`,
    yielding one {description: bool} dict per check
    (True = the assumption holds, i.e. the check did not raise).

    The stream position is restored after every check (seek_then_rewind),
    so the checks are independent of each other and of the caller's position.
    """
    from idiva.utils import seek_then_rewind

    # (report key, check callable) pairs, in the original reporting order.
    # NOTE: the key strings are part of the yielded output — do not edit them.
    checks = [
        ('alt_column', bag_of_assumptions.alt_column),
        ('The ID column is_unique', bag_of_assumptions.id_is_unique),
        ('samples_column', bag_of_assumptions.samples_column),
        ('ref_column', bag_of_assumptions.ref_column),
        ('format_is_gt', bag_of_assumptions.format_is_gt),
    ]

    for (key, check) in checks:
        try:
            with seek_then_rewind(fd):
                check(fd)
        except (AssertionError, RuntimeError):
            yield {key: False}
        else:
            yield {key: True}
def test_md5_head(self):
    """ReadVCF.md5 agrees with a plain md5 digest of the full text stream."""
    import hashlib
    from idiva.io import ReadVCF, open_maybe_gz
    from idiva.utils import seek_then_rewind
    for file in PATHS_LARGE_HEAD.values():
        with open_maybe_gz(file, mode='r') as fd:
            assert isinstance(fd, io.TextIOBase)
            # Digest the whole stream, then rewind to position 0 for ReadVCF.
            with seek_then_rewind(fd, seek=0):
                expected = hashlib.md5(fd.read().encode()).hexdigest()
            self.assertEqual(ReadVCF(fd).md5, expected)
def head(src: io.TextIOBase, n=100) -> io.TextIOBase:
    """
    Yield a reader over a temporary truncated copy of the VCF stream `src`:
    the meta block ("##..."), the header line ("#..."), and the first `n`
    datalines. The position of `src` is restored afterwards.
    """
    from idiva.utils import seek_then_rewind
    from tempfile import NamedTemporaryFile
    with NamedTemporaryFile(mode='w') as tmp:
        with seek_then_rewind(src):
            # Copy meta lines and the terminating header line.
            while True:
                row = src.readline().strip()
                print(row, file=tmp)
                if not row.startswith("##"):
                    break
            assert row.startswith("#")
            # Copy the first n datalines.
            for _ in range(n):
                row = src.readline().strip()
                assert not row.startswith("#")
                print(row, file=tmp, flush=True)
        # Reopen the (still existing) temp file for the consumer.
        with open(tmp.name, mode='r') as fd:
            yield fd
def open_maybe_gz(file, *, mode='r') -> typing.Union[io.TextIOBase, io.BytesIO]:
    """
    Open `file` for reading that could be a
    - file descriptor
    - path to file
    - path to gzipped file

    `mode` is either 'r' or 'rb', and has to be specified.

    Usage:
        with open_maybe_gz(path_to_file, mode='r') as fd:
            print(fd.read())
    """
    # NOTE(review): this is a generator — presumably wrapped with
    # @contextlib.contextmanager at the definition site; confirm in the full file.
    assert mode in ['r', 'rb']

    # Case 1: already an open text stream — nothing to convert.
    if isinstance(file, io.TextIOBase):
        assert (mode == 'r'), "Can't convert TextIOBase to mode='rb'."
        yield file
        return

    # Case 2: an open binary stream — sniff for gzip by attempting a short read.
    if isinstance(file, io.IOBase):
        assert isinstance(file, (io.BufferedIOBase, io.TextIOBase))
        with contextlib.ExitStack() as stack:
            import gzip
            try:
                # Probe from the start of the stream; position is restored on exit.
                # (seek_then_rewind is expected to be imported at module level.)
                with seek_then_rewind(file, seek=0):
                    with gzip.open(file) as test:
                        test.read(2)
            except gzip.BadGzipFile:
                # Assume this is not a gzipped stream
                pass
            else:
                # gzip didn't complain
                # interpret this as a gzipped stream
                file = stack.enter_context(gzip.open(file))
            if (mode == 'r'):
                # Text was requested but we hold a binary stream: wrap it.
                if not isinstance(file, io.TextIOBase):
                    assert isinstance(file, io.BufferedIOBase)
                    file = stack.enter_context(io.TextIOWrapper(file))
            # The ExitStack keeps any wrappers open for the duration of the yield.
            yield file
            return

    # Case 3: a URL — download to a local cache, then recurse on the descriptor.
    if isinstance(file, str) and re.match(r"^(http|https|ftp)://", file):
        from idiva.download import download
        with download(file).now.open(mode='rb') as fd:
            with open_maybe_gz(fd, mode=mode) as fd:
                yield fd
        return

    # Case 4: a filesystem path (str or Path-like).
    from pathlib import Path
    assert Path(file).is_file()
    file = str(file)

    if file.endswith(".gz"):
        # Gzipped file on disk: decompress transparently.
        import gzip
        with gzip.open(file, mode='rb') as fd:
            if (mode == 'r'):
                yield io.TextIOWrapper(fd)
            elif (mode == 'rb'):
                yield fd
    else:
        # Plain file: open directly in the requested mode.
        with open(file, mode=mode) as fd:
            yield fd
def rewind_when_done(self):
    """Context manager: restore the current position of `self.fd` on exit."""
    from idiva.utils import seek_then_rewind
    rewinder = seek_then_rewind(self.fd, seek=None)
    with rewinder:
        yield
def preload_all(self):
    """Materialize all datalines into memory (restoring the stream position); returns self."""
    from idiva.utils import seek_then_rewind
    with seek_then_rewind(self.fd, seek=None):
        self.datalines = list(self.datalines)
    return self
def feature_extraction_batch(self, reader_ctrl: ReadVCF, reader_case: ReadVCF,
                             names_ctrl: List[str], names_case: List[str],
                             batches_ctrl: List[int], batches_case: List[int],
                             idx: int, clf, id: List[int]):
    """
    Train `clf` on one batch of patients drawn from both VCF files.

    Loads the idx-th batch of sample columns from the control and the case
    file (each file's datalines are re-read from `dataline_start_pos`),
    aligns both to the common variant index `id` (missing variants filled
    with 4), and runs one `partial_fit` step with labels 0 (ctrl) / 1 (case).

    Returns the (incrementally) trained classifier.
    """
    with seek_then_rewind(reader_ctrl.fd, seek=reader_ctrl.dataline_start_pos) as fd_ctrl:
        with seek_then_rewind(reader_case.fd, seek=reader_case.dataline_start_pos) as fd_case:
            # Both files undergo the identical load/index/transpose pipeline.
            dataframe_ctrl = self._read_patient_batch(fd_ctrl, names_ctrl, batches_ctrl, idx, id)
            dataframe_case = self._read_patient_batch(fd_case, names_case, batches_case, idx, id)

            # Labels: 0 = control patient, 1 = case patient (one row per patient).
            labels = np.append(np.zeros(dataframe_ctrl.shape[0]), np.ones(dataframe_case.shape[0]))

            # pd.concat replaces the deprecated DataFrame.append (same result).
            dataframe = pd.concat([dataframe_ctrl, dataframe_case])
            dataframe, labels = shuffle(dataframe, labels, random_state=0)

            """
            # for Random Forest Classifier
            clf.n_estimators += 10000
            clf.fit(dataframe, labels)
            """
            clf.partial_fit(dataframe, labels, classes=[0, 1])
    return clf

def _read_patient_batch(self, fd, names: List[str], batches: List[int], idx: int, id: List[int]):
    """
    Load one batch of patient columns from an open VCF dataline stream.

    Reads CHROM/POS/ALT plus the sample columns of batch `idx`, deduplicates
    variants, replaces CHROM/POS/ALT by the combined 'ID' (via index_map),
    and transposes so that rows = patients and columns = variants, reindexed
    to the common variant index `id` (fill value 4 = variant absent).
    """
    # First three names/columns are the variant key: CHROM (0), POS (1), ALT (4).
    batch_names = names[:3]
    batch_names.extend(names[batches[idx] + 3:batches[idx + 1] + 3])
    batch_columns = [0, 1, 4]
    # Sample columns start at file column 9 (after the fixed VCF columns).
    batch_columns.extend(range(batches[idx] + 9, batches[idx + 1] + 9))

    # Genotype strings are converted to numeric codes; key columns are kept raw.
    converters = {column: self.convert_strang
                  for column in batch_names
                  if column not in ['CHROM', 'POS', 'ALT']}

    dataframe = pd.read_csv(fd, sep='\t', header=None,
                            usecols=batch_columns, names=batch_names,
                            converters=converters)
    dataframe = dataframe.drop_duplicates(['CHROM', 'POS', 'ALT'], keep='first')
    dataframe['ID'] = dataframe[['CHROM', 'POS', 'ALT']].apply(index_map, axis=1)
    dataframe = dataframe.drop(['CHROM', 'POS', 'ALT'], axis=1)
    dataframe = dataframe.set_index('ID')
    dataframe = dataframe.transpose()
    return dataframe.reindex(columns=id, fill_value=4)
loaded_model = pickle.load(open(filename, 'rb')) else: warnings.warn("no model saved") loaded_model = DummyClassifier() return loaded_model def align(case: ReadVCF, ctrl: ReadVCF): """ aligning case and control vcf file by joining on chrom, pos, ref and alt """ from idiva.utils import seek_then_rewind dfs = {} for (k, vcf) in zip(['case', 'ctrl'], [case, ctrl]): with seek_then_rewind(vcf.fd, seek=vcf.dataline_start_pos) as fd: dfs[k] = pd.read_csv(fd, sep='\t', usecols=[0, 1, 2, 3, 4], header=None, names=["CHROM", "POS", "ID", "REF", "ALT"]) dfs[k].index = dfs[k].index.rename(name="rowid") dfs[k] = dfs[k].reset_index().astype({'rowid': 'Int64'}) dfs['case'] = dfs['case'].drop_duplicates(['CHROM', 'POS', 'REF', 'ALT'], keep='first') dfs['ctrl'] = dfs['ctrl'].drop_duplicates(['CHROM', 'POS', 'REF', 'ALT'], keep='first') df = join(case=dfs['case'], ctrl=dfs['ctrl']) df['CHROM'] = pd.to_numeric(df[['CHROM']].apply(translate_chrom, axis=1)) df['CPA_ID'] = df[['CHROM', 'POS', 'ALT']].apply(index_map, axis=1) df = df.set_index('CPA_ID')
def translate_vcf(self, vcf) -> pd.DataFrame:
    """
    Returns a dataframe that contains the following features from a vcf file
    CHROM, POS, ID, VAR

    Only single-nucleotide variants (REF and ALT each one of A/C/G/T) are
    kept; CHROM is translated to a numeric code and REF/ALT are encoded
    via `self.encode_ref_alt`.
    """
    cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
    assert cache.is_dir()

    with ReadVCF.open(vcf) as reader:
        with seek_then_rewind(reader.fd, seek=reader.dataline_start_pos) as fd:
            # FIX: np.int / np.str were deprecated aliases of the builtins and
            # are removed in NumPy >= 1.24 — use the builtins directly.
            dataframe = pd.read_csv(fd, sep='\t',
                                    usecols=range(len(DataHandler.INIT_COLS)),
                                    header=None, names=DataHandler.INIT_COLS,
                                    dtype={
                                        'CHROM': int,
                                        'POS': int,
                                        'ID': str,
                                        'REF': str,
                                        'ALT': str
                                    })

            # Check if ALT contains only one value or several values seperated by ','
            assert (len([
                uni for uni in dataframe['ALT'].unique().tolist() if ',' in uni
            ]) == 0)

            # store only SNP variants
            dataframe = dataframe[dataframe['REF'].apply(
                lambda x: {x}.issubset({'A', 'C', 'G', 'T'}))]
            dataframe = dataframe[dataframe['ALT'].apply(
                lambda x: {x}.issubset({'A', 'C', 'G', 'T'}))]

            # Check if only SNP (single characters) — vectorized instead of
            # the former per-row assert loops; same condition.
            assert dataframe['REF'].str.len().eq(1).all()
            assert dataframe['ALT'].str.len().eq(1).all()
            assert (set(dataframe['REF'].unique().tolist()).issubset(
                {'A', 'C', 'G', 'T'}))
            assert (set(dataframe['ALT'].unique().tolist()).issubset(
                {'A', 'C', 'G', 'T'}))

            dataframe['CHROM'] = pd.to_numeric(dataframe[['CHROM']].apply(
                self.translate_chrom, axis=1))

            dataframe = self.encode_ref_alt(dataframe)

            # FIX: the result of drop_duplicates was previously discarded
            # (no in-place effect); assign it so duplicate rows are removed.
            dataframe = dataframe.drop_duplicates()

            # TODO: same CHROM POS and rsID but not same REF & ALT
            # consequence of real world data (Kjong Nov 30)
            # => identify samples by CHROM, POS and VAR
            # same CHROM rsID REF ALT but not same POS
            # => rsIDs are not completely unique !
            # Ignore rsID (Kjong Nov 23)
            """
            print(len(dataframe['ID'].unique().tolist()))
            print(len(dataframe['ID'].tolist()))
                     CHROM       POS           ID REF ALT  VAR
            56638       17   1649616  rs544719440   A   G    2
            576511      17  19159733  rs540831825   A   G    2
            717227      17  27196477  rs202111951   T   C   10
            919995      17  34642425  rs568794696   C   A    3
            2105598     17  77663493  rs148485780   C   T    5
                     CHROM       POS           ID REF ALT  VAR
            56637       17   1649616  rs544719440   A   C    1
            576510      17  19159733  rs540831825   A   C    1
            717226      17  27196477  rs202111951   T   A    9
            919587      17  34540858  rs568794696   C   A    3
            2105592     17  77663435  rs148485780   C   T    5
            """
            return dataframe