Beispiel #1
0
def VariantFileWriter(filepath: str,
                      allow_extra_fields: bool = False,
                      use_gzip: bool = True):
    '''
    Generator that opens `filepath` through AtomicSaver and yields a variant writer for it.

    Variants are represented by dictionaries.  Intended usage (this presumably relies on a
    context-manager decorator that is applied outside of this block):

        with VariantFileWriter('a.tsv') as writer:
            writer.write({'chrom': '2', 'pos': 47, ...})

    Each variant/association/hit/locus written must have a subset of the keys of the first one.

    filepath: final destination of the file; its base directory is created via make_basedir().
    allow_extra_fields: forwarded unchanged to the underlying writer (_vfw).
    use_gzip: when true, the saver is opened in binary mode and wrapped in a text-mode gzip
        stream (compresslevel=2); otherwise the saver is opened in text mode and used directly.
    '''
    tmp_path = get_tmp_path(filepath)
    make_basedir(filepath)
    # The saver must hand out a binary handle when gzip will do the text encoding itself.
    saver = AtomicSaver(filepath,
                        text_mode=not use_gzip,
                        part_file=tmp_path,
                        overwrite_part=True,
                        rm_part_on_exc=False)
    with saver as raw_f:
        if not use_gzip:
            yield _vfw(raw_f, allow_extra_fields, filepath)
        else:
            with gzip.open(raw_f, 'wt', compresslevel=2) as gz_f:
                yield _vfw(gz_f, allow_extra_fields, filepath)
Beispiel #2
0
def annotate_trait_descriptions(in_filepath,
                                out_filepath,
                                phenolist_path=None):
    """
    Annotate a phenotype correlation file with an additional "Trait2Label" description (where possible)

    The header line gets a trailing "Trait2Label" column.  Every data row whose second column
    (Trait2) is a known phenocode is copied to the output with that phenotype's label appended:
    its 'phenostring' if present, otherwise the phenocode itself.  Rows with an unknown Trait2
    are logged as a warning and dropped.  Blank lines are ignored.

    in_filepath: path of the tab-delimited correlation file to read.
    out_filepath: path of the annotated file to write (written through AtomicSaver).
    phenolist_path: forwarded to get_phenolist(filepath=...).

    Raises ValueError if a non-blank data row has fewer than three tab-separated columns.

    FIXME: This makes simplistic assumptions about file format/contents, and performs no validation
    """
    # Initial file format spec (per SarahGT) is a tab-delimited format:
    #   Trait1  Trait2  rg  SE  Z  P-value  Method

    pheno_labels = {
        pheno['phenocode']: pheno.get('phenostring', pheno['phenocode'])
        for pheno in get_phenolist(filepath=phenolist_path)
    }

    with open(in_filepath,
              'r') as in_f, AtomicSaver(out_filepath,
                                        text_mode=True,
                                        part_file=get_tmp_path(out_filepath),
                                        overwrite_part=True) as out_f:

        headers = in_f.readline().strip()
        out_f.write(headers + '\tTrait2Label\n')

        for line in in_f:
            line = line.strip()
            if not line:
                # A blank line (typically a trailing empty line at EOF) carries no data;
                # without this guard the three-way unpack below raises ValueError.
                continue
            # Only the second column is needed; the remainder is kept verbatim in `line`.
            _trait1_code, trait2_code, _rest = line.split('\t', maxsplit=2)
            if trait2_code not in pheno_labels:
                logger.warning(
                    'Correlation file specifies an unknown phenocode; value will be skipped: "{}"'
                    .format(trait2_code))
                continue

            out_f.write(line + '\t{}\n'.format(pheno_labels[trait2_code]))
Beispiel #3
0
def make_symmetric(in_filepath, out_filepath):
    '''
    The output of pheweb-rg-pipeline includes the line
        traitA traitB 0.4 0.1 2 1e-3 ldsc
    but it omits the line
        traitB traitA 0.4 0.1 2 1e-3 ldsc
    so this function adds that second line for the symmetric position in the correlation matrix.

    The header is copied through unchanged; all data rows (original and mirrored) are written
    sorted, with the two trait columns separated by tabs.  Blank lines in the input are ignored,
    and a final line without a trailing newline is handled correctly.

    in_filepath: path of the whitespace-delimited correlation file to read.
    out_filepath: path of the symmetric file to write (written through AtomicSaver).

    Raises AssertionError if the header columns are not the expected ones, StopIteration if the
    input file is completely empty, and IndexError if a data row has fewer than three columns.
    '''
    # TODO: first check if the file already contains lines going both directions
    expected_colnames = [
        'Trait1', 'Trait2', 'rg', 'SE', 'Z', 'P-value', 'Method'
    ]
    with open(in_filepath) as in_f:
        header = next(in_f)
        assert header.rstrip().split() == expected_colnames
        correlations = []
        for line in in_f:
            if not line.strip():
                # Blank lines (e.g. an empty line at EOF) hold no correlation.
                continue
            if not line.endswith('\n'):
                # The remainder of each row is written back verbatim, and rows get re-ordered
                # by the sort below.  A last line lacking its newline would otherwise be glued
                # to whichever row follows it in the output.
                line += '\n'
            corr = line.split(maxsplit=2)
            correlations.append(corr)
            correlations.append([corr[1], corr[0], corr[2]])
        correlations.sort()

    with AtomicSaver(out_filepath,
                     text_mode=True,
                     part_file=get_tmp_path(out_filepath),
                     overwrite_part=True) as out_f:
        out_f.write(header)
        for trait1, trait2, rest_of_line in correlations:
            out_f.write(trait1 + '\t' + trait2 + '\t' + rest_of_line)
Beispiel #4
0
def make_symmetric(in_filepath, out_filepath):
    '''
    The output of pheweb-rg-pipeline includes the line
        traitA traitB 0.4 0.1 2 1e-3 ldsc
    but it omits the line
        traitB traitA 0.4 0.1 2 1e-3 ldsc
    so this function adds that second line for the symmetric position in the correlation matrix.
    If the file already has both directions for some or all pairs of traits, that's okay.

    The header is copied through unchanged; all data rows (original and mirrored) are written
    sorted.  Blank lines in the input are ignored, and a final line without a trailing newline
    is handled correctly.

    in_filepath: path of the tab-delimited correlation file to read.
    out_filepath: path of the symmetric file to write (written through AtomicSaver).

    Raises AssertionError if the header columns are not the expected ones, StopIteration if the
    input file is completely empty, and ValueError if a data row has fewer than three columns.
    '''
    expected_colnames = [
        'Trait1', 'Trait2', 'rg', 'SE', 'Z', 'P-value', 'Method'
    ]
    trait_pairs_seen = set()
    correlations = []
    with open(in_filepath) as in_f:
        header = next(in_f)
        assert header.rstrip().split('\t') == expected_colnames
        for line in in_f:
            if not line.strip():
                # Blank lines (e.g. an empty line at EOF) hold no correlation.
                continue
            if not line.endswith('\n'):
                # The remainder of each row is written back verbatim, and rows get re-ordered
                # by the sort below.  A last line lacking its newline would otherwise be glued
                # to whichever row follows it in the output.
                line += '\n'
            trait1, trait2, rest_of_line = line.split('\t', maxsplit=2)
            trait_pairs_seen.add((trait1, trait2))
            correlations.append((trait1, trait2, rest_of_line))

    # Build the missing mirror rows in a separate list rather than appending to
    # `correlations` while iterating over it.
    mirrored = [(trait2, trait1, rest_of_line)
                for trait1, trait2, rest_of_line in correlations
                if (trait2, trait1) not in trait_pairs_seen]
    correlations.extend(mirrored)
    correlations.sort()

    with AtomicSaver(out_filepath,
                     text_mode=True,
                     part_file=get_tmp_path(out_filepath),
                     overwrite_part=True) as out_f:
        out_f.write(header)
        for trait1, trait2, rest_of_line in correlations:
            out_f.write(trait1 + '\t' + trait2 + '\t' + rest_of_line)
Beispiel #5
0
def make_json_file(args):
    '''
    Build the QQ JSON file for one input file.

    args: mapping with the keys 'src' (file to read variants from), 'dest' (JSON file to
        write) and 'tmp' (part file handed to AtomicSaver).

    The JSON object is empty when the source yields no variants; otherwise it holds
    'overall' (make_qq over every variant's neglog10_pval) and 'by_maf'
    (make_qq_stratified over the variants).  On success a timestamped "src -> dest" line is
    printed.  On any exception, the involved paths and the error are printed and the
    exception is re-raised.
    '''
    src_filename = args['src']
    dest_filename = args['dest']
    tmp_filename = args['tmp']
    try:

        with open(src_filename) as src_f:
            variants = list(get_variants(src_f, fname=src_filename))

        if variants:
            qq_data = {
                'overall': make_qq(v.neglog10_pval for v in variants),
                'by_maf': make_qq_stratified(variants),
            }
        else:
            qq_data = {}

        saver = AtomicSaver(dest_filename,
                            text_mode=True,
                            part_file=tmp_filename,
                            overwrite_part=True)
        with saver as dest_f:
            json.dump(qq_data, dest_f)

        timestamp = datetime.datetime.now()
        print('{}\t{} -> {}'.format(timestamp, src_filename, dest_filename))

    except Exception as exc:
        failure_banner = (
            'ERROR OCCURRED WHEN MAKING QQ FILE {!r} FROM FILE {!r} (TMP FILE AT {!r})'
            .format(dest_filename, src_filename, tmp_filename))
        print(failure_banner)
        print('ERROR WAS:')
        print(exc)
        print('---')
        raise
Beispiel #6
0
def write_json(*, filepath=None, data=None, indent=None, sort_keys=False):
    '''
    Serialize `data` as JSON into `filepath`, writing through AtomicSaver.

    All arguments are keyword-only.  `filepath` and `data` are both required (an
    AssertionError carrying `filepath` is raised if either is None); `indent` and
    `sort_keys` are forwarded to json.dump().  The base directory of `filepath` is
    prepared with make_basedir() before writing.
    '''
    assert not (filepath is None or data is None), filepath
    tmp_path = get_tmp_path(filepath)
    make_basedir(filepath)
    saver = AtomicSaver(filepath,
                        text_mode=True,
                        part_file=tmp_path,
                        overwrite_part=True,
                        rm_part_on_exc=False)
    with saver as out_f:
        json.dump(data, out_f, indent=indent, sort_keys=sort_keys)
Beispiel #7
0
def annotate_genes(file_to_annotate, temp_file, output_file):
    '''
    Append a nearest-gene column to every line of a tab-delimited file.

    All three args are filepaths: `file_to_annotate` is read, `output_file` is written
    through AtomicSaver, and `temp_file` is the part file AtomicSaver uses meanwhile.

    Each input line must have a chromosome in its first column and an integer position in
    its second; the value returned by GeneAnnotator.annotate_position(chrom, pos) is added
    as a new last column.
    '''
    annotator = GeneAnnotator(utils.get_gene_tuples())
    saver = AtomicSaver(output_file, text_mode=True, part_file=temp_file, overwrite_part=True)
    with open(file_to_annotate) as in_f, saver as out_f:
        for raw_line in in_f:
            record = raw_line.rstrip('\n\r')
            columns = record.split('\t')
            chrom = columns[0]
            pos = int(columns[1])
            gene = annotator.annotate_position(chrom, pos)
            out_f.write('\t'.join((record, gene)) + '\n')
Beispiel #8
0
    def save(self, content: dict) -> Path:
        """Write the given config content to the config file as JSON.

        The content is serialized (4-space indent) before the file is opened,
        then written in text mode through AtomicSaver.

        Args:
            content (dict): content to write to file.

        Returns:
            Path: path to config file (``self.file_path``).

        """
        serialized = json.dumps(content, indent=4)
        destination = str(self.file_path)
        with AtomicSaver(destination, text_mode=True) as handle:
            handle.write(serialized)
        return self.file_path
Beispiel #9
0
def VariantFileWriter(filepath, allow_extra_fields=False):
    '''
    Generator that opens `filepath` in text mode through AtomicSaver and yields a variant
    writer for it.

    Variants are represented by dictionaries.  Intended usage (this presumably relies on a
    context-manager decorator that is applied outside of this block):

        with VariantFileWriter('a.tsv') as writer:
            writer.write({'chrom': '2', 'pos': 47, ...})

    filepath: final destination of the file; its base directory is created via make_basedir().
    allow_extra_fields: forwarded unchanged to the underlying writer (_vfw).
    '''
    tmp_path = get_tmp_path(filepath)
    make_basedir(filepath)
    saver_options = dict(text_mode=True,
                         part_file=tmp_path,
                         overwrite_part=True,
                         rm_part_on_exc=False)
    with AtomicSaver(filepath, **saver_options) as out_f:
        writer = _vfw(out_f, allow_extra_fields, filepath)
        yield writer
Beispiel #10
0
def convert(pheno, dest_filename, tmp_filename):
    '''
    Write the chrom/pos/ref/alt columns of one phenotype's variants to a tab-delimited file.

    pheno: phenotype record passed to input_file_parser; its 'phenocode' is used in the
        progress message.
    dest_filename: file to write (through AtomicSaver).
    tmp_filename: part file handed to AtomicSaver.

    Variants are read with a minimum MAF of conf.minimum_maf (0 when conf has no such
    attribute).  An AssertionError is raised if the parser does not report every required
    field.  A timestamped "phenocode -> dest" line is printed once the file is written.
    '''
    required = 'chrom pos ref alt'.split()
    saver = AtomicSaver(dest_filename, text_mode=True, part_file=tmp_filename, overwrite_part=True)

    with saver as f_out:

        maf_cutoff = getattr(conf, 'minimum_maf', 0)
        fieldnames, variants = input_file_parser.get_fieldnames_and_variants(pheno, minimum_maf=maf_cutoff)

        for name in required:
            assert name in fieldnames, name

        # Lazily drop every column except the required ones.
        trimmed = (
            {key: value for key, value in variant.items() if key in required}
            for variant in variants
        )

        tsv = csv.DictWriter(f_out, fieldnames=required, delimiter='\t')
        tsv.writeheader()
        tsv.writerows(trimmed)

    timestamp = datetime.datetime.now()
    print('{}\t{} -> {}'.format(timestamp, pheno['phenocode'], dest_filename))
Beispiel #11
0
def merge(input_filenames, out_filename):
    '''
    Merge several per-phenotype variant files into one de-duplicated chrom/pos/ref/alt file.

    input_filenames: paths of the files to merge; each file's basename is used as its
        phenocode.  NOTE(review): two inputs sharing a basename would overwrite each other
        in `readers` -- confirm callers never do that.
    out_filename: path of the merged tab-delimited file (written through AtomicSaver).

    Each input is read through order_cpras(convert_to_numeric_chrom(get_cpras(file))).  The
    merge repeatedly emits the smallest pending tuple and advances every reader that was
    sitting on it, so this presumably relies on each reader yielding tuples in ascending
    order.  Element 0 of a tuple is an index into utils.chrom_order_list; elements 1-3 are
    written as pos, ref and alt.

    Raises StopIteration (after printing the phenocode) if an input yields no variants.
    '''
    # randrange() needs an int: a float argument such as 1e10 raises TypeError on Python 3.12+.
    tmp_filename = '{}/tmp/merging-{}'.format(conf.data_dir, random.randrange(10**10)) # I don't like tempfile.

    with contextlib.ExitStack() as es, \
         AtomicSaver(out_filename, text_mode=True, part_file=tmp_filename, overwrite_part=True) as f_out:
        f_out.write('\t'.join('chrom pos ref alt'.split()) + '\n')

        # phenocode -> iterator over that file's variant tuples
        readers = {}
        for input_filename in input_filenames:
            phenocode = os.path.basename(input_filename)
            file = es.enter_context(open(input_filename))
            readers[phenocode] = order_cpras(convert_to_numeric_chrom(get_cpras(file)))

        # TODO: use heapq
        # pending tuple -> phenocodes whose reader is currently positioned on that tuple
        next_cpras = {}
        for phenocode, reader in readers.items():
            try:
                next_cpra = next(reader)
            except StopIteration:
                print('StopIteration exception occurred for {}'.format(phenocode))
                raise
            else:
                next_cpras.setdefault(next_cpra, list()).append(phenocode)

        n_variants = 0
        while next_cpras:
            assert len(next_cpras) <= len(input_filenames), len(next_cpras)
            n_variants += 1

            next_cpra = min(next_cpras)
            f_out.write('{chrom}\t{1}\t{2}\t{3}\n'.format(*next_cpra, chrom=utils.chrom_order_list[next_cpra[0]]))

            # Advance every reader that contributed the tuple just written; exhausted
            # readers are dropped so the final assert can verify everything was consumed.
            for phenocode in next_cpras.pop(next_cpra):
                try:
                    next_cpra = next(readers[phenocode])
                except StopIteration:
                    del readers[phenocode]
                else:
                    next_cpras.setdefault(next_cpra, []).append(phenocode)

        assert not readers, list(readers.items())

    print('{:8} variants in {} <- {}'.format(n_variants, os.path.basename(out_filename), [os.path.basename(path) for path in input_filenames]))
Beispiel #12
0
def write_json(*,
               filepath: Optional[str] = None,
               data=None,
               indent: Optional[int] = None,
               sort_keys: bool = False) -> None:
    '''
    Serialize `data` as JSON into `filepath`, writing through AtomicSaver.

    `filepath` and `data` are both required (an AssertionError carrying `filepath` is raised
    if either is None); `indent` and `sort_keys` are forwarded to json.dump().  Objects that
    json cannot encode natively are passed to _json_writer_default.  The base directory of
    `filepath` is prepared with make_basedir() before writing.
    '''
    # Keyword-only on purpose: the argument order is too easy to mix up positionally.
    assert not (filepath is None or data is None), filepath
    tmp_path = get_tmp_path(filepath)
    make_basedir(filepath)
    saver = AtomicSaver(filepath,
                        text_mode=True,
                        part_file=tmp_path,
                        overwrite_part=True,
                        rm_part_on_exc=False)
    with saver as out_f:
        json.dump(data,
                  out_f,
                  indent=indent,
                  sort_keys=sort_keys,
                  default=_json_writer_default)