Example #1
    def __enter__(self):
        self.outfile = helpers.GetFileHandle(self.outfile, 'wt').handler
        self.snpeff_outfile = helpers.GetFileHandle(self.snpeff_outfile, 'wt').handler
        self.ma_outfile = helpers.GetFileHandle(self.ma_outfile, 'wt').handler
        self.ids_outfile = helpers.GetFileHandle(self.ids_outfile, 'wt').handler
        self.write_headers()
        return self
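All of these snippets lean on helpers.GetFileHandle, a project helper that is not shown on this page. A minimal sketch of what such a helper might look like, assuming it transparently opens gzip-compressed files based on the .gz suffix, acts as a context manager, and exposes the raw handle as .handler (as Example #1 uses it):

import gzip

class GetFileHandle(object):
    # hypothetical sketch of the project helper; the real implementation
    # may differ (extra modes, error handling, etc.)
    def __init__(self, filename, mode='rt'):
        if filename.endswith('.gz'):
            self.handler = gzip.open(filename, mode)
        else:
            self.handler = open(filename, mode)

    def __enter__(self):
        return self.handler

    def __exit__(self, exc_type, exc_value, traceback):
        self.handler.close()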
Example #2
def concatenate_vcf(infiles, outfile):
    '''
    Concatenate VCF files

    :param infiles: dictionary of input VCF files to be concatenated
    :param outfile: output VCF file
    '''
    if isinstance(infiles, dict):
        keys = infiles.keys()
        keys = sorted(keys)
        infiles = [infiles[val] for val in keys]

    with helpers.GetFileHandle(outfile, 'w') as ofile:
        header = None

        for ifile in infiles:

            if os.path.getsize(ifile) == 0:
                warnings.warn('input file {} is empty'.format(ifile))
                continue

            with helpers.GetFileHandle(ifile) as f:

                if not header:
                    header = _get_header(f)

                    for line in header:
                        ofile.write(line)
                else:
                    if _get_header(f) != header:
                        warnings.warn(
                            'merging vcf files with mismatching headers')

                for l in f:
                    ofile.write(l)
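A hypothetical call, assuming per-region VCFs keyed by region name (the keys only fix the concatenation order):

concatenate_vcf(
    {'region_1': 'calls_region_1.vcf.gz', 'region_2': 'calls_region_2.vcf.gz'},
    'calls_merged.vcf.gz',
)

Note that _get_header is a module-level helper referenced here but not shown on this page.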
Example #3
    def write_headerless_csv(self, infile):
        with helpers.GetFileHandle(self.filepath, 'wt') as writer:
            with helpers.GetFileHandle(infile) as reader:
                if reader.readline() != self.header_line:
                    raise CsvWriterError("cannot write, wrong header")
                self.write_csv_data(reader, writer)

        self.__write_yaml()
Example #4
    def write_csv_with_header(self, infile, headerless_input=True):
        with helpers.GetFileHandle(self.filepath, 'wt') as writer:
            with helpers.GetFileHandle(infile) as reader:
                if headerless_input:
                    writer.write(self.header_line)
                self.write_csv_data(reader, writer)

        self.__write_yaml()
Example #5
    def concatenate_files(self, infiles):
        header = self.header_line if self.header else None

        with helpers.GetFileHandle(self.filepath, 'wt') as writer:
            if header:
                writer.write(header)
            for infile in infiles:
                with helpers.GetFileHandle(infile) as reader:
                    self.write_csv_data(reader, writer)

        self.__write_yaml()
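Examples #3 through #5 are methods of the same CSV writer class, and all delegate the body copy to write_csv_data, which is not shown here. A minimal sketch of what it plausibly does, assuming a straight line-by-line copy:

def write_csv_data(self, reader, writer):
    # hypothetical sketch: stream the remaining rows from the input
    # handle to the output handle unchanged
    for line in reader:
        writer.write(line)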
Example #6
def update_germline_header_sample_ids(infile, outfile, sample_id):
    with helpers.GetFileHandle(infile) as indata:
        with helpers.GetFileHandle(outfile, 'wt') as outdata:
            for line in indata:
                if line.startswith('#CHROM'):
                    outdata.write('##normal_sample={}\n'.format(sample_id))
                    line = line.strip().split()
                    assert line[-1] in ['normal', sample_id]
                    line[-1] = sample_id
                    line = '\t'.join(line) + '\n'
                    outdata.write(line)
                else:
                    outdata.write(line)
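A hypothetical before/after for the header rewrite, assuming the last sample column is named 'normal':

# before:  #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  normal
# after:   ##normal_sample=SA123
#          #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  SA123
update_germline_header_sample_ids('germline.vcf.gz', 'germline_reheader.vcf.gz', 'SA123')

Note that strip().split() re-tokenizes the header line on any whitespace and the join writes it back tab-separated, which is what VCF expects.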
Example #7
    def generate_metadata(self):
        with helpers.GetFileHandle(self.filepath) as inputfile:
            header = inputfile.readline().strip()
            sep = self.__detect_sep_from_header(header)
            columns = header.split(sep)
            # the returned header value is a flag: this file does have a header
            header = True
            dtypes = self.__generate_dtypes(sep=sep)
            return header, sep, dtypes, columns
Example #8
def get_vcf_header(vcf_file):
    vcf_data = []
    with helpers.GetFileHandle(vcf_file) as vcf_reader:
        for line in vcf_reader:
            if line.startswith('#'):
                vcf_data.append(line)
                continue
            break
    return vcf_data
Example #9
    def __write_yaml(self):

        yamldata = {'header': self.header, 'sep': self.sep, 'columns': []}

        for column in self.columns:
            data = {'name': column, 'dtype': self.dtypes[column]}
            yamldata['columns'].append(data)

        with helpers.GetFileHandle(self.yaml_file, 'wt') as f:
            yaml.safe_dump(yamldata, f, default_flow_style=False)
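For a concrete picture of the sidecar this writes, assume header=True, sep=',' and two columns, 'chrom' (str) and 'start' (int); yaml.safe_dump, which sorts keys, would then emit roughly:

# columns:
# - dtype: str
#   name: chrom
# - dtype: int
#   name: start
# header: true
# sep: ','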
Example #10
def reheader_reads(infile, outfile):
    with helpers.GetFileHandle(infile, 'rt') as indata, helpers.GetFileHandle(outfile, 'wt') as outdata:
        line_one = indata.readline()

        header = line_one.split('\t')

        assert len(header) == 8

        assert header[0] != 'prediction_id'

        header = ['prediction_id', 'library_id', 'fragment_id', 'read_end', 'seq', 'qual', 'comment', 'filtered']

        header = '\t'.join(header) + '\n'

        outdata.write(header)
        outdata.write(line_one)

        for line in indata:
            outdata.write(line)
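A hypothetical call; the input is expected to be a headerless 8-column TSV, and the function prepends the fixed column names:

reheader_reads('reads.tsv.gz', 'reads_reheadered.tsv.gz')
# output now starts with:
# prediction_id  library_id  fragment_id  read_end  seq  qual  comment  filtered

The two asserts guard against input that already carries a header or has an unexpected column count.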
Example #11
def merge_mafs(maf_files, output):

    if isinstance(maf_files, dict):
        maf_files = list(maf_files.values())

    with helpers.GetFileHandle(output, 'wt') as maf_writer:

        with helpers.GetFileHandle(maf_files[0]) as header_read:
            header = header_read.readline()
            assert header.startswith('#version 2.4')
            maf_writer.write(header)

            header = header_read.readline()
            assert header.startswith('Hugo_Symbol')
            maf_writer.write(header)

        for filepath in maf_files:
            with helpers.GetFileHandle(filepath, 'rt') as maf_reader:
                for line in maf_reader:
                    if line.startswith('Hugo_Symbol') or line.startswith('#'):
                        continue
                    maf_writer.write(line)
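A hypothetical call merging per-sample MAFs; when a dict is passed, only the values are used:

merge_mafs({'SA123': 'SA123.maf', 'SA124': 'SA124.maf'}, 'cohort.maf')

The version line and column header are taken from the first file, and header lines in every file are skipped during the body copy.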
Example #12
def main(infile, outfile, mappability_blacklist):
    blacklist = load_blacklist(mappability_blacklist)

    vcf_header = get_vcf_header(infile)
    vcf_header = update_vcf_header(vcf_header, mappability_blacklist)

    with helpers.GetFileHandle(outfile, 'wt') as vcf_writer:
        write_to_file(vcf_writer, vcf_header)

        for vcf_data in load_vcf_file(infile):
            annotated_vcf_data = annotate_vcf_data(vcf_data, blacklist)

            write_to_file(vcf_writer, annotated_vcf_data)
Example #13
def load_vcf_file(vcf_file):
    vcf_data = []
    with helpers.GetFileHandle(vcf_file) as vcf_reader:
        for line in vcf_reader:
            if line.startswith('#'):
                continue

            line = line.strip().split()

            vcf_data.append(line)

            if len(vcf_data) > 1e6:
                yield vcf_data
                vcf_data = []
    yield vcf_data
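Because the loader yields chunks of up to a million parsed records rather than single lines, callers iterate over chunks, as Example #12 does. A hypothetical loop:

for vcf_data in load_vcf_file('calls.vcf.gz'):
    for record in vcf_data:
        chrom, pos = record[0], record[1]
        # ... process one variant record ...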
Example #14
    def __parse_metadata(self):
        with helpers.GetFileHandle(self.filepath + '.yaml') as yamlfile:
            yamldata = yaml.safe_load(yamlfile)

        header = yamldata['header']
        sep = yamldata.get('sep', ',')

        dtypes = {}
        columns = []
        for coldata in yamldata['columns']:
            colname = coldata['name']

            dtypes[colname] = coldata['dtype']

            columns.append(colname)

        return header, sep, dtypes, columns
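This is the read-side counterpart of __write_yaml in Example #9. Given the sidecar sketched there, it would return, roughly:

# header  -> True
# sep     -> ','
# dtypes  -> {'chrom': 'str', 'start': 'int'}
# columns -> ['chrom', 'start']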
Example #15
def parse_roh_output(infile, outfile):
    parsed = []

    with helpers.GetFileHandle(infile) as indata:
        for line in indata:
            if line.startswith('#'):
                continue

            line = line.strip().split()

            if line[0] == 'ST':
                parsed.append({
                    'type': line[0],
                    'sample': line[1],
                    'chromosome': line[2],
                    'start': line[3],
                    'end': float('nan'),
                    'state': line[4],
                    'length': float('nan'),
                    'num_markers': float('nan'),
                    'quality': line[5]
                })
            elif line[0] == 'RG':
                parsed.append({
                    'type': line[0],
                    'sample': line[1],
                    'chromosome': line[2],
                    'start': line[3],
                    'end': line[4],
                    'state': float('nan'),
                    'length': line[5],
                    'num_markers': line[6],
                    'quality': line[7]
                })

    parsed = pd.DataFrame(parsed)

    csvutils.write_dataframe_to_csv_and_yaml(parsed,
                                             outfile,
                                             write_header=True)
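The ST/RG record layout is consistent with bcftools roh output, where ST rows carry a per-site state and RG rows carry a per-region summary. A hypothetical two-record input for reference:

# ST  SA123  1  10000  1  99.0                 (type, sample, chrom, pos, state, quality)
# RG  SA123  1  10000  20000  10001  55  80.0  (type, sample, chrom, start, end, length, markers, quality)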
Example #16
def select_optimal_solution(
    chunks,
    params_files,
    segments,
    igv_segs,
    markers,
    parsed_files,
    plots,
    optimal_segment,
    optimal_igv_segs,
    optimal_param,
    optimal_marker,
    optimal_parsed,
    optimal_plot,
):
    '''
    Selects the optimal cluster/ploidy combination from an input set of
    cluster/ploidy-resolved params files and copies the corresponding
    output files to the 'optimal' output paths.

    :param chunks: (ploidy, num_clusters) keys, one per candidate solution
    :param params_files: params file per ploidy/cluster
    :param segments: segments file per ploidy/cluster
    :param igv_segs: IGV-formatted segments file per ploidy/cluster
    :param markers: markers file per ploidy/cluster
    :param parsed_files: parsed file per ploidy/cluster
    :param plots: plot file per ploidy/cluster
    :param optimal_segment: output path for the optimal segments file
    :param optimal_igv_segs: output path for the optimal IGV segments file
    :param optimal_param: output path for the optimal params file
    :param optimal_marker: output path for the optimal markers file
    :param optimal_parsed: output path for the optimal parsed file
    :param optimal_plot: output path for the optimal plot file
    '''

    model_select_idxs = []

    # find optimal cluster/ploidy
    for chunk in chunks:
        params = params_files[chunk]
        parsed_params = get_param_file_vals(params)
        dbw_index = parsed_params['S_Dbw validity index (Both)'][0]
        model_select_idxs.append((chunk, dbw_index))

    best_model = min(model_select_idxs, key=lambda t: t[1])
    best_model = best_model[0]

    # copy the file at the optimal cluster/ploidy to the
    # optimal file output path
    csvutils.finalize_csv(segments[best_model], optimal_segment, sep='\t')
    csvutils.finalize_csv(params_files[best_model], optimal_param, sep='\t')
    csvutils.finalize_csv(markers[best_model], optimal_marker, sep='\t')
    csvutils.finalize_csv(parsed_files[best_model], optimal_parsed, sep='\t')
    shutil.copyfile(plots[best_model], optimal_plot)
    shutil.copyfile(igv_segs[best_model], optimal_igv_segs)

    with helpers.GetFileHandle(optimal_param, 'at') as params_output:
        ploidy, num_clusters = best_model
        params_output.write('ploidy: {}\n'.format(ploidy))
        params_output.write('num_clusters: {}\n'.format(num_clusters))
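The closing lines unpack each chunk key as a (ploidy, num_clusters) tuple, and the 'S_Dbw validity index (Both)' criterion is consistent with TITAN params files. A hypothetical shape for the inputs:

# hypothetical chunk keys and per-chunk file maps
chunks = [(2, 1), (2, 2), (3, 1)]
params_files = {c: 'titan_params_{}_{}.txt'.format(c[0], c[1]) for c in chunks}
# segments, igv_segs, markers, parsed_files and plots are dicts keyed the same way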
Example #17
    def __detect_sep_from_file(self):
        with helpers.GetFileHandle(self.filepath) as reader:
            header = reader.readline().strip()
            return self.__detect_sep_from_header(header)
Example #18
def split_vcf_by_chr(vcf_file, chromosome, output):
    with helpers.GetFileHandle(vcf_file, 'rt') as vcf_reader, \
            helpers.GetFileHandle(output, 'wt') as vcf_writer:
        for line in vcf_reader:
            # match the chromosome column exactly: a bare startswith() on
            # chromosome '1' would also pass lines for '10', '11', ...
            if line.startswith('#') or line.split('\t', 1)[0] == chromosome:
                vcf_writer.write(line)
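A hypothetical call, extracting chromosome 1 together with the full header:

split_vcf_by_chr('calls.vcf.gz', '1', 'calls_chr1.vcf.gz')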