def bed_intervals_gen(coordgen, interval_bed_fn) -> Generator[Coord, None, None]:
    """Generate coordinate intervals corresponding to the provided BED file."""
    with FileParser(
            fn=interval_bed_fn,
            colnames=["chrom", "start", "end"],
            dtypes={"start": int, "end": int},
            force_col_len=False,
            comment="track",
            quiet=True) as bed:
        prev_ct = None
        for line in bed:
            ct = coordgen(line.chrom, line.start, line.end)
            if prev_ct and ct < prev_ct:
                raise ValueError(
                    "Unsorted coordinate found in BED file: {} found after {}. Chromosomes have to be ordered as in the FASTA reference file".format(ct, prev_ct))
            prev_ct = ct
            yield ct
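
# A minimal standalone sketch of the same sorted-BED contract, shown for
# orientation only: it bypasses pycoMeth's FileParser/CoordGen helpers and
# compares coordinates as plain tuples. The function name and the chrom_order
# argument (the chromosome order of the reference FASTA) are illustrative,
# not part of pycoMeth.
def _simple_bed_intervals_sketch(bed_fn, chrom_order):
    """Yield (chrom, start, end) tuples from a sorted 3-column BED file."""
    rank = {chrom: i for i, chrom in enumerate(chrom_order)}
    prev = None
    with open(bed_fn) as bed:
        for raw in bed:
            # Skip track lines, comments and empty lines
            if raw.startswith(("track", "#")) or not raw.strip():
                continue
            chrom, start, end = raw.rstrip("\n").split("\t")[:3]
            ct = (rank[chrom], int(start), int(end))
            # Enforce the same ordering rule as bed_intervals_gen above
            if prev is not None and ct < prev:
                raise ValueError(f"Unsorted interval in {bed_fn}: {ct} after {prev}")
            prev = ct
            yield chrom, int(start), int(end)
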
def Interval_Aggregate(cpg_aggregate_fn: str,
                       ref_fasta_fn: str,
                       interval_bed_fn: str = None,
                       output_bed_fn: str = None,
                       output_tsv_fn: str = None,
                       interval_size: int = 1000,
                       min_cpg_per_interval: int = 5,
                       sample_id: str = "",
                       min_llr: float = 2,
                       verbose: bool = False,
                       quiet: bool = False,
                       progress: bool = False,
                       **kwargs):
    """
    Bin the output of `pycoMeth CpG_Aggregate` in genomic intervals, using either an annotation file containing intervals or a sliding window.
    * cpg_aggregate_fn
        Output TSV file generated by CpG_Aggregate (can be gzipped)
    * ref_fasta_fn
        Reference file used for alignment in FASTA format (ideally already indexed with samtools faidx)
    * interval_bed_fn
        SORTED BED file containing **non-overlapping** intervals to bin CpG data into (optional) (can be gzipped)
    * output_bed_fn
        Path to write a summary result file in BED format (at least 1 output file is required) (can be gzipped)
    * output_tsv_fn
        Path to write a more extensive result report in TSV format (at least 1 output file is required) (can be gzipped)
    * interval_size
        Size of the sliding window in which to aggregate CpG site data if no BED file is provided
    * min_cpg_per_interval
        Minimal number of CpG sites per interval
    * sample_id
        Sample ID to be used for the BED track header
    * min_llr
        Minimal log likelihood ratio to consider a site significantly methylated or unmethylated in the output BED file
    """
    # Init method
    opt_summary_dict = opt_summary(local_opt=locals())
    log = get_logger(name="pycoMeth_Interval_Aggregate", verbose=verbose, quiet=quiet)

    log.warning("Checking options and input files")
    log_dict(opt_summary_dict, log.debug, "Options summary")

    # Init collections
    counter = Counter()
    coordgen = CoordGen(ref_fasta_fn, verbose, quiet)
    log_list(coordgen, log.debug, "Coordinate reference summary")

    # At least one output file is required, otherwise it doesn't make any sense
    log.debug("Checking required output")
    if not output_bed_fn and not output_tsv_fn:
        raise pycoMethError("At least 1 output file is required (-t or -b)")

    # Define type of window generator
    log.debug("Defining interval generator")
    if interval_bed_fn:
        log.debug("Bed annotation generator")
        intervals_gen = bed_intervals_gen(coordgen=coordgen, interval_bed_fn=interval_bed_fn)
    else:
        log.debug("Sliding window generator")
        intervals_gen = sliding_intervals_gen(coordgen=coordgen, interval_size=interval_size)

    # Open file parser, file writer and progress bar
    log.warning("Parsing CpG_aggregate file")
    try:
        fp_in = FileParser(
            fn=cpg_aggregate_fn,
            dtypes={"start": int, "end": int, "median_llr": float, "num_motifs": int},
            verbose=verbose,
            quiet=quiet,
            include_byte_len=True)
        if not fp_in.input_type == "CpG_Aggregate":
            raise pycoMethError("Invalid input file type passed (cpg_aggregate_fn). Expecting pycoMeth CpG_Aggregate output TSV file")

        fp_out = Interval_Writer(
            bed_fn=output_bed_fn,
            tsv_fn=output_tsv_fn,
            sample_id=sample_id,
            min_llr=min_llr,
            min_cpg_per_interval=min_cpg_per_interval,
            verbose=verbose)

        try:
            with tqdm(total=len(fp_in), unit=" bytes", unit_scale=True, desc="\tProgress", disable=not progress) as pbar:
                # Get first line
                line = fp_in.next()
                line_coord = coordgen(line.chromosome, line.start, line.end)
                counter["Lines parsed"] += 1
                pbar.update(line.byte_len)

                # Get intervals from either the bed generator or the sliding window generator
                for win_coord in intervals_gen:
                    counter["Total number of intervals"] += 1
                    llr_list = []
                    pos_list = []
                    num_motifs = 0

                    while True:
                        # Check if window and center of CpG overlap
                        center = win_coord.center_comp(line_coord)

                        # Center of CpG is greater than current window = write out current values and go to the next window
                        if center == "greater":
                            fp_out.write(coord=win_coord, num_motifs=num_motifs, llr_list=llr_list, pos_list=pos_list)
                            break

                        # Center of CpG falls inside current window = save llr value
                        if center == "inside":
                            llr_list.append(line.median_llr)
                            pos_list.append(int(line_coord.center))
                            num_motifs += line.num_motifs

                        # Center of CpG is lower than or inside the current window = keep reading lines
                        line = fp_in.next()
                        line_coord = coordgen(line.chromosome, line.start, line.end)
                        counter["Lines parsed"] += 1
                        pbar.update(line.byte_len)

        # Stop when reaching the end of the input file
        except StopIteration:
            # Write last interval
            fp_out.write(coord=win_coord, num_motifs=num_motifs, llr_list=llr_list, pos_list=pos_list)

    finally:
        # Print counters
        log_dict(counter, log.info, "Results summary")
        log_dict(fp_out.counter, log.info, "Writer summary")
        # Close input and output files
        for fp in (fp_in, fp_out):
            try:
                fp.close()
            except Exception:
                pass
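
# Example invocation from Python, assuming CpG_Aggregate has already been run.
# All file paths and the sample ID below are hypothetical placeholders; the
# call is wrapped in a helper so it is not executed at import time.
def _example_interval_aggregate():
    Interval_Aggregate(
        cpg_aggregate_fn="sample_1_CpG_Aggregate.tsv.gz",
        ref_fasta_fn="reference.fa",
        interval_bed_fn="CGI_annotation.bed",  # omit to fall back to sliding windows of interval_size bases
        output_bed_fn="sample_1_Interval_Aggregate.bed.gz",
        output_tsv_fn="sample_1_Interval_Aggregate.tsv.gz",
        min_cpg_per_interval=5,
        sample_id="sample_1",
        progress=True)
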
def Meth_Comp(aggregate_fn_list: [str],
              ref_fasta_fn: str,
              output_tsv_fn: str = None,
              output_bed_fn: str = None,
              max_missing: int = 0,
              min_diff_llr: float = 2,
              sample_id_list: [str] = None,
              pvalue_adj_method: str = "fdr_bh",
              pvalue_threshold: float = 0.01,
              verbose: bool = False,
              quiet: bool = False,
              progress: bool = False,
              **kwargs):
    """
    Compare methylation values for each CpG position or interval between n samples and perform a statistical test to evaluate if the positions are significantly different.
    For 2 samples a Mann-Whitney test is performed, otherwise multiple samples are compared with a Kruskal-Wallis test.
    p-values are adjusted for multiple tests using the Benjamini & Hochberg procedure for controlling the false discovery rate.
    * aggregate_fn_list
        A list of output TSV files corresponding to several samples to compare, generated by either CpG_Aggregate or Interval_Aggregate (can be gzipped)
    * ref_fasta_fn
        Reference file used for alignment in FASTA format (ideally already indexed with samtools faidx)
    * output_bed_fn
        Path to write a summary result file in BED format (at least 1 output file is required) (can be gzipped)
    * output_tsv_fn
        Path to write a more extensive result report in TSV format (at least 1 output file is required) (can be gzipped)
    * max_missing
        Max number of missing samples to perform the test
    * min_diff_llr
        Minimal llr boundary for negative and positive median llr. The test is only performed if at least one sample has a median llr above (methylated) and 1 sample has a median llr below (unmethylated)
    * sample_id_list
        List of sample IDs to annotate results in the TSV file
    * pvalue_adj_method
        Method to use for p-value multiple test adjustment
    * pvalue_threshold
        Alpha parameter (family-wise error rate) for p-value adjustment
    """
    # Init method
    opt_summary_dict = opt_summary(local_opt=locals())
    log = get_logger(name="pycoMeth_CpG_Comp", verbose=verbose, quiet=quiet)

    log.warning("Checking options and input files")
    log_dict(opt_summary_dict, log.debug, "Options summary")

    # Init collections
    coordgen = CoordGen(ref_fasta_fn, verbose, quiet)
    log_list(coordgen, log.debug, "Coordinate reference summary")

    # At least one output file is required, otherwise it doesn't make any sense
    log.debug("Checking required output")
    if not output_bed_fn and not output_tsv_fn:
        raise pycoMethError("At least 1 output file is required (-t or -b)")

    # Automatically define tests and maximal missing samples depending on number of files to compare
    all_samples = len(aggregate_fn_list)
    min_samples = all_samples - max_missing

    # 3 values or more = Kruskal-Wallis test
    if all_samples >= 3:
        pvalue_method = "KW"
        log.debug("Multiple comparison mode (Kruskal-Wallis test)")
        if min_samples < 3:
            log.debug("Automatically raise number of minimal samples to 3")
            min_samples = 3
    # 2 values = Mann-Whitney test
    elif all_samples == 2:
        pvalue_method = "MW"
        log.debug("Pairwise comparison mode (Mann-Whitney test)")
        if min_samples < 2:
            log.debug("No missing samples allowed for 2 samples comparison")
            min_samples = 2
    else:
        raise pycoMethError("Meth_Comp needs at least 2 input files")

    log.warning("Parsing files")
    try:
        log.info("Reading input file headers and checking consistency between them")
        colnames = set()
        input_type = ""
        fp_list = []
        all_fp_len = 0
        if not sample_id_list or len(sample_id_list) != len(aggregate_fn_list):
            sample_id_list = list(range(len(aggregate_fn_list)))

        for label, fn in zip(sample_id_list, aggregate_fn_list):
            fp = FileParser(
                fn=fn,
                label=label,
                dtypes={"start": int, "end": int, "median_llr": float},
                verbose=verbose,
                quiet=quiet,
                include_byte_len=True)
            all_fp_len += len(fp)
            fp_list.append(fp)

            # Check colnames
            if not colnames:
                colnames = set(fp.colnames)
            elif colnames != set(fp.colnames):
                raise ValueError(f"Invalid field {fp.colnames} in file {fn}")

            # Get input file type
            if not input_type:
                input_type = fp.input_type
            elif input_type != fp.input_type:
                raise ValueError("Inconsistent input types")

        # Check that aggregate_fn_list contains valid input types
        if input_type not in ["CpG_Aggregate", "Interval_Aggregate"]:
            raise pycoMethError("Invalid input file type passed (aggregate_fn_list). Expecting pycoMeth CpG_Aggregate or Interval_Aggregate output TSV files")

        # Define StatsResults to collect valid sites and perform stats
        stats_results = StatsResults(
            pvalue_method=pvalue_method,
            pvalue_adj_method=pvalue_adj_method,
            pvalue_threshold=pvalue_threshold,
            min_diff_llr=min_diff_llr,
            min_samples=min_samples,
            input_type=input_type)

        log.info("Starting asynchronous file parsing")
        with tqdm(total=all_fp_len, unit=" bytes", unit_scale=True, desc="\tProgress", disable=not progress) as pbar:
            coord_d = defaultdict(list)

            # Read first line from each file
            log.debug("Reading first lines")
            for fp in fp_list:
                # Move pointer up and index by coordinate
                try:
                    line = fp.next()
                    coord = coordgen(line.chromosome, line.start, line.end)
                    coord_d[coord].append(fp)
                    pbar.update(line.byte_len)
                except StopIteration:
                    raise pycoMethError("Empty file found")

            # Continue reading lines from all files
            log.debug("Starting deep parsing")
            fp_done = 0
            while True:
                # Get the lowest coordinate currently indexed
                lower_coord = sorted(coord_d.keys())[0]
                coord_fp_list = sorted(coord_d[lower_coord], key=lambda x: x.label)

                # Deal with lower coordinates and compute result if needed
                stats_results.compute_pvalue(
                    coord=lower_coord,
                    line_list=[coord_fp.current() for coord_fp in coord_fp_list],
                    label_list=[coord_fp.label for coord_fp in coord_fp_list])

                # Remove lower entry and move each fp to its next sequence
                del coord_d[lower_coord]
                for fp in coord_fp_list:
                    # Move pointer up and index by coordinate
                    try:
                        line = fp.next()
                        coord = coordgen(line.chromosome, line.start, line.end)
                        coord_d[coord].append(fp)
                        pbar.update(line.byte_len)
                    except StopIteration:
                        fp_done += 1

                # Exit condition = all files are finished
                if fp_done == len(fp_list):
                    break

        # Init file writer
        with Comp_Writer(bed_fn=output_bed_fn, tsv_fn=output_tsv_fn, input_type=input_type, verbose=verbose) as writer:
            # Exit condition
            if not stats_results.res_list:
                log.info("No valid p-value could be computed")
            else:
                # Convert results to dataframe and correct p-values for multiple tests
                log.info("Adjusting p-values")
                stats_results.multitest_adjust()

                # Write output file
                log.info("Writing output file")
                for res in tqdm(stats_results.res_list, unit=" sites", unit_scale=True, desc="\tProgress", disable=not progress):
                    writer.write(res)

    finally:
        # Print counters
        log_dict(stats_results.counter, log.info, "Results summary")
        # Close input and output files
        for fp in fp_list:
            try:
                fp.close()
            except Exception:
                pass
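
# Sketch of the test selection and p-value adjustment described in the
# Meth_Comp docstring, wired up directly with scipy and statsmodels on
# made-up llr lists. This illustrates the statistical logic only; in
# pycoMeth the actual work is done by StatsResults.compute_pvalue and
# StatsResults.multitest_adjust, and this helper is not part of the API.
def _meth_comp_stats_sketch():
    from scipy.stats import kruskal, mannwhitneyu
    from statsmodels.stats.multitest import multipletests

    def site_pvalue(llr_lists):
        """Mann-Whitney for 2 samples, Kruskal-Wallis for 3 or more."""
        if len(llr_lists) == 2:
            return mannwhitneyu(llr_lists[0], llr_lists[1], alternative="two-sided").pvalue
        return kruskal(*llr_lists).pvalue

    # One p-value per CpG site/interval (llr values are illustrative)
    pvalues = [
        site_pvalue([[2.5, 3.1, 2.8], [-2.9, -3.3, -2.1]]),  # clearly divergent site
        site_pvalue([[0.1, -0.2, 0.3], [0.2, 0.0, -0.1]]),   # ambiguous site
    ]
    # Benjamini & Hochberg FDR control, as with pvalue_adj_method="fdr_bh"
    reject, adj_pvalues, _, _ = multipletests(pvalues, alpha=0.01, method="fdr_bh")
    return reject, adj_pvalues
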
def CpG_Aggregate(nanopolish_fn: [str],
                  ref_fasta_fn: str,
                  output_bed_fn: str = "",
                  output_tsv_fn: str = "",
                  min_depth: int = 10,
                  sample_id: str = "",
                  min_llr: float = 2,
                  verbose: bool = False,
                  quiet: bool = False,
                  progress: bool = False,
                  **kwargs):
    """
    Calculate methylation frequency at genomic CpG sites from the output of `nanopolish call-methylation`
    * nanopolish_fn
        Path to a nanopolish call_methylation tsv output file, a list of files, or a regex matching several files (can be gzipped)
    * ref_fasta_fn
        Reference file used for alignment in FASTA format (ideally already indexed with samtools faidx)
    * output_bed_fn
        Path to write a summary result file in BED format (at least 1 output file is required) (can be gzipped)
    * output_tsv_fn
        Path to write a more extensive result report in TSV format (at least 1 output file is required) (can be gzipped)
    * min_depth
        Minimal number of reads covering a site to be reported
    * sample_id
        Sample ID to be used for the BED track header
    * min_llr
        Minimal log likelihood ratio to consider a site significantly methylated or unmethylated in the output BED file
    """
    # Init method
    opt_summary_dict = opt_summary(local_opt=locals())
    log = get_logger(name="pycoMeth_CpG_Aggregate", verbose=verbose, quiet=quiet)

    log.warning("Checking options and input files")
    log_dict(opt_summary_dict, log.debug, "Options summary")

    # At least one output file is required, otherwise it doesn't make any sense
    if not output_bed_fn and not output_tsv_fn:
        raise pycoMethError("At least 1 output file is required (-t or -b)")

    # Init SitesIndex object with ref_fasta_fn to aggregate data at genomic position level
    log.warning("Parsing methylation_calls file")
    sites_index = SitesIndex(ref_fasta_fn=ref_fasta_fn)

    # Open file parser
    # Possible fields: chromosome strand start end read_name log_lik_ratio log_lik_methylated log_lik_unmethylated num_calling_strands num_motifs sequence
    dtypes = {"start": int, "end": int, "log_lik_ratio": float, "num_motifs": int}
    with FileParser(
            fn=nanopolish_fn,
            dtypes=dtypes,
            verbose=verbose,
            quiet=quiet,
            include_byte_len=progress) as fp_in:

        if not fp_in.input_type == "call_methylation":
            raise pycoMethError("Invalid input file type passed (nanopolish_fn). Expecting Nanopolish call_methylation output TSV file")

        log.info("Starting to parse Nanopolish methylation call file")
        with tqdm(total=len(fp_in), unit=" bytes", unit_scale=True, desc="\tProgress", disable=not progress) as pbar:
            for lt in fp_in:
                sites_index.add(lt)
                # Update progress bar
                if progress:
                    pbar.update(lt.byte_len)

        log_dict(fp_in.counter, log.info, "Parsing summary")

        log.info("Filtering out low coverage sites")
        sites_index.filter_low_count(min_depth)

        log.info("Sorting each chromosome by coordinates")
        sites_index.sort()

        log_dict(sites_index.counter, log.info, "Sites summary")

    log.warning("Processing valid sites found and writing to file")
    with CpG_Writer(bed_fn=output_bed_fn, tsv_fn=output_tsv_fn, sample_id=sample_id, min_llr=min_llr, verbose=verbose) as fp_out:
        for coord, val_dict in tqdm(sites_index, unit=" sites", unit_scale=True, desc="\tProgress", disable=not progress):
            fp_out.write(coord, val_dict)
        log_dict(fp_out.counter, log.info, "Results summary")
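
# Example invocation on a Nanopolish methylation call file. File paths and
# the sample ID are hypothetical placeholders; the call is wrapped in a
# helper so it is not executed at import time.
def _example_cpg_aggregate():
    CpG_Aggregate(
        nanopolish_fn="sample_1_methylation_calls.tsv.gz",
        ref_fasta_fn="reference.fa",
        output_bed_fn="sample_1_CpG_Aggregate.bed.gz",
        output_tsv_fn="sample_1_CpG_Aggregate.tsv.gz",
        min_depth=10,
        sample_id="sample_1",
        progress=True)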