Esempio n. 1
0
def compute_qvalues(pvalues: List[np.double], debug: bool) -> List[np.double]:
    """Adjust P-values for multiple testing (Benjamini-Hochberg FDR).

    Parameters
    ----------
    pvalues : list
        raw P-values
    debug : bool
        trace the full error stack

    Returns
    -------
    list
        corrected P-values (q-values)
    """

    if not isinstance(pvalues, list):
        exception_handler(
            TypeError,
            "Expected list, got {}.\n".format(type(pvalues).__name__),
            debug,
        )

    print("\nComputing q-values...\n")
    # the corrected P-values are the second item of the multipletests result
    return list(multipletests(pvalues, method="fdr_bh")[1])
Esempio n. 2
0
def print_results(results: pd.DataFrame, debug: bool) -> None:
    """Print GRAFIMO results to stdout as the full (untruncated) summary table.

    Parameters
    ----------
    results : pandas.DataFrame
        analysis results
    debug : bool
        trace the full error stack
    """

    if not isinstance(results, pd.DataFrame):
        errmsg = "Expected pandas.DataFrame, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(results).__name__),
                          debug)

    # By default pandas elides the middle rows of long frames. Raise the row
    # limit only while printing: option_context restores the value that was
    # in effect before the call (not the pandas default), even if print fails.
    with pd.option_context("display.max_rows", len(results)):
        print()  # newline
        print(results)
Esempio n. 3
0
def average_bg_with_rc(bgs: Dict, debug: bool):
    """Average each background probability with the one of its reverse
    complement nucleotide.

    Parameters
    ----------
    bgs : dict
        background probability distribution
    debug: bool
        trace full error stack

    Returns
    -------
    dict
        background probability distribution averaged for reverse
        complement frequencies
    """

    if not isinstance(bgs, dict):
        exception_handler(
            TypeError, "Expected dict, got {}.\n".format(type(bgs).__name__), debug
        )

    averaged: Dict = {}
    for base in bgs:
        complement: str = REV_COMPL[base.upper()]
        # handle every complementary pair once, from its lower-ordered member
        if REV_COMPL[complement] != base or ord(base) >= ord(complement):
            continue
        mean_freq = np.double((bgs[base] + bgs[complement]) / np.double(2))
        averaged[base] = mean_freq
        averaged[complement] = mean_freq
    return averaged
Esempio n. 4
0
def print_scoring_msg(motif: Motif, noreverse: bool, debug: bool):
    """Announce on stdout that scoring of the motif hits is starting.

    One line is printed for the forward motif ("+<ID>") and, unless the
    reverse strand is skipped, one for the reverse motif ("-<ID>").

    Parameters
    ----------
    motif : Motif
        motif object
    noreverse : bool
        skip reverse strand sequences
    debug : bool
        trace the full error stack
    """

    for value, expected, label in ((motif, Motif, "Motif"),
                                   (noreverse, bool, "bool")):
        if not isinstance(value, expected):
            errmsg = "Expected " + label + ", got {}.\n"
            exception_handler(TypeError, errmsg.format(type(value).__name__),
                              debug)

    template = "Scoring hits for motif {}."
    forward_id: str = "".join(("+", motif.motifID))
    if noreverse:
        print(template.format(forward_id))
        return
    reverse_id: str = "".join(("-", motif.motifID))
    print(template.format(forward_id))
    print(template.format(reverse_id), end="\n\n")
Esempio n. 5
0
def get_regions_bed(bedfile: str, debug: bool) -> Tuple[Dict, int]:
    """Read a BED file and group its genomic regions by chromosome.

    Only lines starting with "chr" are treated as data; every other line is
    skipped. Files whose name ends with ".gz" are read through gzip.

    Parameters
    ----------
    bedfile : str
        path to BED file
    debug : bool
        trace the full error stack

    Returns
    -------
    dict
        genomic regions grouped by chromosome; each value is a list of
        (start, stop) tuples kept as the strings read from the file
    int
        number of genomic regions
    """

    if not isinstance(bedfile, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(bedfile).__name__), debug)
    if not os.path.isfile(bedfile):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(bedfile), debug)
    if not isbed(bedfile, debug):
        errmsg = "{} is not a UCSC BED file.\n"
        exception_handler(FileFormatError, errmsg.format(bedfile), debug)
    if os.stat(bedfile).st_size == 0:
        errmsg = "{} is empty.\n"
        exception_handler(FileReadError, errmsg.format(bedfile), debug)

    regions: Dict = dict()
    region_num: int = 0
    # compressed input is recognised by its extension
    opener = gzip.open if bedfile.endswith(".gz") else open
    try:
        # the context manager closes the stream on every exit path and never
        # touches an unbound handle when opening the file itself fails
        with opener(bedfile, mode="rt") as ifstream:
            for line in ifstream:
                if not line.startswith("chr"):  # header, comment or blank line
                    continue
                chrom, start, stop = line.strip().split()[:3]
                regions.setdefault(chrom, []).append((start, stop))
                region_num += 1
    except Exception:
        # any read/parse failure is reported as a FileReadError; unlike a bare
        # except, KeyboardInterrupt and SystemExit are left to propagate
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(FileReadError, errmsg.format(bedfile), debug)

    return regions, region_num
Esempio n. 6
0
def indexVG(vg: str, vcf: str, threads: int, verbose: bool,
            debug: bool) -> int:
    """Construct the XG and GBWT indexes for the given genome variation
    graph by running ``vg index``.

    The names of the two index files are derived from the second-to-last
    dot-separated field of `vg` (e.g. ".chr1.vg" gives "chr1.xg" and
    "chr1.gbwt"), so they are created relative to the current directory.

    The indexing operation could take some time.

    Parameters
    ----------
    vg : str
        path to the genome variation graph (VG format)
    vcf : str
        path to the phased VCF file used to build the corresponding
        VG
    threads : int
        number of threads to use during indexing
    verbose : bool
        print information about graph indexing (adds ``-p`` to the command)
    debug : bool
        trace the full error stack

    Returns
    -------
    int
        status of VG indexing (0 = all ok; 1 = an error occurred)
    """

    if not isinstance(vg, str):
        errmsg = "Expected str instance, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(vg).__name__), debug)
    if not os.path.exists(vg):
        errmsg = "Unable to find {}.\n"
        # the path must be interpolated, otherwise the user sees a literal "{}"
        exception_handler(FileNotFoundError, errmsg.format(vg), debug)

    # NOTE(review): taking field [-2] of the dot-split path only yields a
    # sensible name when the path holds no other dots or directory parts;
    # left unchanged because callers rely on the resulting file names.
    graph_name: str = vg.split('.')[-2]
    xg: str = ''.join([graph_name, ".xg"])
    gbwt: str = ''.join([graph_name, ".gbwt"])

    # perform indexing of the current genome variation graph
    vg_index: str = 'vg index -t {0} -G {1} -v {2} -x {3} {4}'.format(
        threads, gbwt, vcf, xg, vg)
    if verbose:
        vg_index += ' -p'  # print information about indexing
    code: int = subprocess.call(vg_index, shell=True)

    # collapse any non-zero exit status to the documented failure value
    return 0 if code == 0 else 1
Esempio n. 7
0
def get_chromlist(ref_genome: str, debug: bool) -> List[str]:
    """Scan the reference genome FASTA file and collect the names of the
    sequences it contains.

    A sequence name is the first whitespace-separated token of every line
    starting with ">", without the leading ">" (e.g. ">chr1 desc" gives
    "chr1"). All other lines are ignored.

    Parameters
    ----------
    ref_genome : str
        path to the reference genome FASTA file
    debug : bool
        trace the full error stack

    Returns
    -------
    list
        sequence (chromosome) names found in the FASTA file, in file order;
        empty when the file holds no header line
    """

    # explicit check instead of an assert, which is stripped under -O
    if not os.path.isfile(ref_genome):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(ref_genome), debug)

    chroms: List[str] = []
    try:
        with open(ref_genome, mode='r') as ifstream:
            for line in ifstream:
                if line.startswith(">"):
                    # keep only the identifier, dropping ">" and description
                    chroms.append(line.rstrip().split()[0][1:])
    except KeyboardInterrupt:
        sigint_handler()
    except Exception:
        errmsg = "A problem was encountered reading {}.\n"
        exception_handler(FileReadError, errmsg.format(ref_genome), debug)

    return chroms
Esempio n. 8
0
def get_kmers(
    queries: List[str],
    pool: mp.Pool,
    debug: bool,
    verbose: Optional[bool] = False,
) -> None:
    """Retrieve sequences from genome variation graph(s). Every query is
    handed to `get_seqs` through the given process pool.

    When `verbose` is False a progress bar is refreshed once per second
    until all the queries are done; when True the elapsed time is printed
    instead.

    Parameters
    ----------
    queries : list
        list of queries
    pool : multiprocessing.Pool
        pool of worker processes; closed on success, terminated on SIGINT
    debug : bool
        trace the full error stack
    verbose : bool, optional
        print additional information
    """

    if not isinstance(queries, list):
        errmsg = "Expected list, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(queries).__name__), debug)

    if verbose: start_re: float = time.time()
    try:
        res: mp.pool.MapResult = pool.map_async(get_seqs, queries)
        if not verbose:
            # Read the total before polling: if the jobs are already finished
            # at the first check the final bar still needs it.
            # _number_left is a private attribute of the async result.
            tot = res._number_left
            while not res.ready():
                remaining = res._number_left
                printProgressBar(
                    (tot - remaining), tot, prefix='Progress:', suffix='Complete', length=50
                )
                time.sleep(1)
            if tot > 0:  # nothing to report when there was no work at all
                printProgressBar(
                    tot, tot, prefix='Progress:', suffix='Complete', length=50
                )
        # the result is discarded, but get() re-raises worker errors; the
        # timeout (216000 s) keeps the wait interruptible by signals
        res.get(60 * 60 * 60)
    except KeyboardInterrupt:
        pool.terminate()
        sigint_handler()
    else:
        pool.close()
        if verbose:
            end_re: float = time.time()
            print("Extracted sequences from all regions in %.2fs" % (end_re - start_re))
Esempio n. 9
0
def norm_motif(motif_probs: pd.DataFrame, motif_width: int,
               alphabet: List[str], debug: bool) -> pd.DataFrame:
    """Rescale every motif position so that its probabilities sum to 1.

    The matrix must hold probabilities (PFM) rather than raw counts. It is
    indexed by nucleotide (rows) and position (columns), is modified in
    place and is also returned.

    Parameters
    ----------
    motif_probs : pandas.DataFrame
        motif probability matrix (PFM)
    motif_width : int
        motif width
    alphabet : list
        DNA motif alphabet
    debug: bool
        trace the full error stack

    Returns
    -------
    pandas.DataFrame
        normalized motif probability matrix (nPFM)
    """

    if not isinstance(motif_probs, pd.DataFrame):
        exception_handler(
            TypeError,
            "Expected pandas.DataFrame, got {}.\n".format(
                type(motif_probs).__name__),
            debug,
        )
    if not isinstance(motif_width, int):
        exception_handler(
            TypeError,
            "Expected int, got {}.\n".format(type(motif_width).__name__),
            debug,
        )
    if motif_width <= 0:
        exception_handler(ValueError, "Forbidden motif width.\n", debug)
    if not isinstance(alphabet, list):
        exception_handler(
            TypeError,
            "Expected list, got {}.\n".format(type(alphabet).__name__),
            debug,
        )
    foreign_symbols = [nuc for nuc in alphabet if nuc not in DNA_ALPHABET]
    if foreign_symbols:
        exception_handler(
            ValueError, "The motif is not built on DNA alphabet.\n", debug
        )

    # largest accepted gap between a position's total probability and 1
    tolerance: float = 0.00001
    for position in range(motif_width):
        column_total = sum(
            (motif_probs.loc[nuc, position] for nuc in alphabet), np.double(0)
        )
        assert column_total != 0
        if almost_equal(1, column_total, tolerance):
            continue  # position already normalized
        for nuc in alphabet:
            rescaled = np.double(motif_probs.loc[nuc, position] / column_total)
            motif_probs.loc[nuc, position] = rescaled

    return motif_probs
Esempio n. 10
0
def buildvg(args_obj: BuildVG, debug: bool) -> None:
    """Entry point of the genome variation graph construction: print the
    run summary and hand the arguments over to `construct_vg`.

    Parameters
    ----------
    args_obj : BuildVG
        container of the arguments needed to build a genome variation
        graph
    debug : bool
        trace the full error stack
    """

    if not isinstance(args_obj, BuildVG):
        exception_handler(
            TypeError,
            "Expected BuildVG object, got {}.\n".format(type(args_obj).__name__),
            debug,
        )
    printWelcomeMsg()
    verbose = args_obj.verbose  # when set, the user parameters are echoed

    print("\n\nBuilding the VG for chromosome:")
    # chromosome names on one line, each followed by a blank
    print("".join(str(chrom) + " " for chrom in args_obj.chroms), end="")
    print("\n")  # newline

    if verbose:
        print("Buildvg user parameters:")
        print("\t- Reference genome: ", args_obj.reference_genome)
        print("\t- VCF file: ", args_obj.vcf)
        print("\t- Reindex: ", args_obj.reindex)
        print("\t- Chromosomes: ", args_obj.chroms)
        print("\t- Chromosome prefix: ", args_obj.chroms_prefix)
        print("\t- Name-map: ", args_obj.namemap)
        print("\t- Cores: ", args_obj.cores)
        print("\t- Output directory: ", args_obj.outdir)
        print("\t- Debug:", debug)
        print("\t- Verbose: ", verbose)
        print("\t- Test mode: ", args_obj.get_test())
        print("\nBeginning VGs construction\n")

    # the VGs will be stored in the defined output directory
    construct_vg(args_obj, debug)
Esempio n. 11
0
def process_motif_for_logodds(motif: Motif, debug: bool) -> Motif:
    """Turn the motif matrix into a scaled log-odds scoring matrix and
    attach the matching P-value matrix.

    The steps are: log-odds computation, scaling of the log-odds matrix
    (scale, offset, minimum and maximum are stored on the motif) and
    P-value matrix computation. The motif is updated in place through its
    setters and returned.

    Parameters
    ----------
    motif : Motif
        DNA motif
    debug : bool
        trace the full error stack

    Returns
    -------
    Motif
        the same motif object, now carrying score and P-value matrices
    """

    if not isinstance(motif, Motif):
        exception_handler(
            TypeError,
            "Expected Motif, got {}.\n".format(type(motif).__name__),
            debug,
        )

    # step 1: log-odds scores
    log_odds = compute_log_odds(
        motif.countMatrix, motif.width, motif.bg, motif.alphabet,
        motif.nucsmap, debug,
    )
    motif.set_motifScoreMatrix(log_odds)

    # step 2: scale the scores and record the scaling parameters
    scaling = scale_pwm(
        motif.scoreMatrix, motif.alphabet, motif.width, motif.nucsmap, debug
    )
    scaled, lowest, highest, factor, shift = scaling
    motif.set_motifScoreMatrix(scaled)
    motif.set_isScaled()
    motif.set_scale(factor)
    motif.set_minVal(lowest)
    motif.set_maxVal(highest)
    motif.set_offset(shift)

    # step 3: P-value matrix
    motif.set_motifPvalMatrix(comp_pval_mat(motif, debug))

    return motif
Esempio n. 12
0
def __read_counts_meme(motif_file: str, ifstream, width: int,
                       debug: bool) -> List[List[np.double]]:
    """Read motif letter probabilities from MEME files.

    ...

    Parameters
    ----------
    motif_file : str
        path to motif PWM
    ifstream : _io.TextIOWrapper
        input stream
    width : int
        motif width
    debug:
        trace the full error stack

    Returns 
    -------
    list
        motif letter probabilities
    """

    a = list()
    c = list()
    g = list()
    t = list()
    pos = 0
    for line in ifstream:
        freqs = line.split()
        if len(freqs) != 4:
            if pos < width:
                errmsg = "Unexpected end of motif found.\n"
                exception_handler(EOFError, errmsg, debug)
            break  # motif stop
        a.append(np.double(freqs[0]))
        c.append(np.double(freqs[1]))
        g.append(np.double(freqs[2]))
        t.append(np.double(freqs[3]))
        pos += 1
    probs = [a, c, g, t]
    if any([len(p) != len(probs[0]) for p in probs]):
        errmsg = "Mismatch in letter probabilities vectors lengths.\n"
        exception_handler(ValueError, errmsg, debug)
    return probs
Esempio n. 13
0
def pseudo_bg(bgs: Dict, no_reverse: bool, debug: bool) -> Dict:
    """Prepare the nucleotide background probabilities used to compute the
    scoring matrix from the input motif PWM.

    When both strands are considered, the probabilities are first averaged
    with those of the reverse complement nucleotides (`average_bg_with_rc`);
    in every case they are then normalized with `norm_bg`.

    Parameters
    ----------
    bgs : dict
        background probability distribution
    no_reverse : bool
        if True the averaging step on both DNA strands is skipped
        (only fwd strand considered).
    debug: bool
        trace the full error stack

    Returns
    -------
    dict
        normalized background probability distribution
    """

    if not isinstance(bgs, dict):
        exception_handler(
            TypeError, "Expected dict, got {}.\n".format(type(bgs).__name__), debug
        )
    if not isinstance(no_reverse, bool):
        exception_handler(
            TypeError,
            "Expected bool, got {}.\n".format(type(no_reverse).__name__),
            debug,
        )

    # forward only: keep the input as is; both strands: average with the rc
    strand_bgs = bgs if no_reverse else average_bg_with_rc(bgs, debug)
    return norm_bg(strand_bgs, debug)
Esempio n. 14
0
def get_reference_genome_from_ucsc(debug) -> str:
    """Fetch the hg38 reference genome from the UCSC server into the
    current working directory, decompress it and return the path to the
    resulting FASTA file.

    This function has been written only for test purposes

    Parameters
    ----------
    debug : bool
        trace the full error stack

    Returns
    -------
    str
        absolute path to the downloaded FASTA file (in .fa format)
    """

    def _run_checked(command: str) -> None:
        # run a shell command and report a non-zero exit status
        if subprocess.call(command, shell=True) != 0:
            failure = "An error occurred while executing \"{}\". Exiting.\n"
            exception_handler(SubprocessError, failure.format(command), debug)

    # download genome (it lands in the current directory)
    url = "ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz"
    _run_checked("wget -c {}".format(url))

    # uncompress genome
    print("Uncompressing the genome...")
    archive: str = './hg38.fa.gz'
    if not os.path.exists(archive):
        exception_handler(
            FileNotFoundError, "Unable to find {}.\n".format(archive), debug
        )
    _run_checked('gunzip {0}'.format(archive))

    # remove FASTA.GZ file if still present
    if os.path.exists(archive):
        _run_checked('rm {0}'.format(archive))

    # the uncompressed genome should be in the current dir
    fasta: str = "./hg38.fa"
    assert os.path.exists(fasta)
    return os.path.abspath(fasta)
Esempio n. 15
0
def __read_alphabet_meme(motif_file: str, ifstream, debug: bool) -> List[str]:
    """Read alphabet from MEME files.

    The stream is consumed up to and including the first line starting with
    "ALPHABET". Only the alphabet "ACGT" is accepted.

    Parameters
    ----------
    motif_file : str
        path to motif PWM (used in error messages only)
    ifstream : _io.TextIOWrapper
        input stream
    debug : bool
        trace the full error stack

    Returns
    -------
    list
        alphabet, as the sorted list of its letters
    """

    # advance to the alphabet declaration; the for/else branch runs only when
    # the stream is exhausted without finding one
    for line in ifstream:
        if line.startswith("ALPHABET"): break
    else:
        errmsg = "Unexpected EOF reached, unable to parse {}.\n"
        exception_handler(EOFError, errmsg.format(motif_file), debug)
    # NOTE(review): this is only reachable with a non-ALPHABET line if
    # exception_handler returns instead of aborting -- confirm; on an empty
    # stream `line` would be unbound here
    if not line.startswith("ALPHABET"):
        errmsg = "No line stores alphabet in {}.\n"
        exception_handler(ValueError, errmsg.format(motif_file), debug)
    # drop the "ALPHABET= " prefix, leaving only the alphabet letters
    line = line.strip().replace("ALPHABET= ", "")
    if line == "ACGT": alphabet = sorted(list(line))
    else:
        errmsg = "The motif is not built on DNA alphabet.\n"
        exception_handler(ValueError, errmsg, debug)
    # sanity check of the parsed alphabet against the module-level DNA alphabet
    assert isListEqual(alphabet, DNA_ALPHABET)
    return alphabet
Esempio n. 16
0
def isVGindexed(vg: str, debug: bool) -> bool:
    """Check if the genome variation graph has been indexed (XG format).

    The decision is based only on the file extension: ".xg" means indexed,
    ".vg" means not indexed, anything else is reported as an error.

    Parameters
    ----------
    vg : str
        path to genome variation graph
    debug : bool
        trace the full error stack

    Returns
    -------
    bool
        True for an XG file, False for a VG file
    """

    if not isinstance(vg, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(vg).__name__), debug)
    if not os.path.isfile(vg):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(vg), debug)

    # text after the last dot (the whole name when there is no dot)
    extension = vg.rsplit(".", 1)[-1]
    if extension == "xg":
        return True
    if extension == "vg":
        return False
    # unknown genome variation graph format
    errmsg = "Unknown genome variation graph format (VG or XG allowed).\n"
    exception_handler(VGError, errmsg, debug)
Esempio n. 17
0
def get_1000GProject_vcf(debug) -> str:
    """Download a WGS VCF file (SNVs and indels) of the 1000 Genomes Project
    into the current working directory. The file is used for VG construction
    and graph indexing test purposes.

    Since the variants present in this file are not phased, it cannot be
    used to build the GBWT index and the corresponding haplotypes cannot
    be used. To use these features the VCF must be phased first.

    Parameters
    ----------
    debug : bool
        trace the full error stack

    Returns
    -------
    str
        absolute path to the downloaded VCF file (compressed)
    """

    # download the VCF
    url: str = "".join((
        'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/',
        '1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/',
        'ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz',
    ))
    wget_cmd: str = 'wget -c {0}'.format(url)
    status: int = subprocess.call(wget_cmd, shell=True)
    if status != 0:
        failure: str = "An error occurred while executing \"{}\". Exiting.\n"
        exception_handler(SubprocessError, failure.format(wget_cmd), debug)

    # vcf should be in the current dir
    local_copy: str = './ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz'
    assert os.path.exists(local_copy)
    return os.path.abspath(local_copy)
Esempio n. 18
0
def norm_bg(bgs: Dict, debug: bool):
    """Add the background pseudocount to every probability and rescale the
    distribution by the pseudocount-augmented total.

    Parameters
    ----------
    bgs : dict
        background probability distribution
    debug: bool
        trace the full error stack

    Returns
    -------
    dict
        normalized background probability distribution
    """

    if not isinstance(bgs, dict):
        exception_handler(
            TypeError, "Expected dict, got {}.\n".format(type(bgs).__name__), debug
        )

    # denominator: one pseudocount per symbol plus all the input probabilities
    pseudo_mass = np.double(len(bgs) * PSEUDObg)
    denominator = sum((np.double(bgs[nuc]) for nuc in bgs), pseudo_mass)
    assert denominator > 0

    normalized = {
        nuc: np.double((bgs[nuc] + PSEUDObg) / denominator) for nuc in bgs
    }

    # sanity check on the raw input values (not on the normalized ones)
    raw_total = sum(bgs.values(), np.double(0))
    assert raw_total != 0
    return normalized
Esempio n. 19
0
def build_motif_JASPAR(motif_file: str, bg_file: str, pseudocount: float,
                       no_reverse: bool, verbose: bool, debug: bool) -> Motif:
    """Build the Motif object from a JASPAR motif Position Weight
    Matrix.

    The motif file is parsed with `read_JASPAR_motif` and then processed
    with `process_motif_for_logodds`, which computes the scoring matrix
    and the P-value matrix used to assign a statistical significance to
    each motif occurrence candidate.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or the uniform
        background marker (`UNIF`)
    pseudocount : float
        value to add to motif PWM counts (must be positive)
    no_reverse : bool
        if True only the forward strand will be considered, otherwise
        both forward and reverse are considered
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack

    Returns
    -------
    Motif
        processed motif object
    """

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isJaspar_ff(motif_file, debug):
        errmsg = "Required JASPAR motif PWM parsing, but {} is not in JASPAR format.\n"
        exception_handler(MotifFileFormatError, errmsg.format(motif_file),
                          debug)
    if not isinstance(bg_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(bg_file).__name__),
                          debug)
    if bg_file != UNIF and not os.path.isfile(bg_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(bg_file), debug)
    if pseudocount <= 0:
        errmsg = "Pseudocount value must be positive.\n"
        # pass the message, not the offending value, to the handler
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(no_reverse, bool):
        errmsg = "Expected bool, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(no_reverse).__name__),
                          debug)

    # parse motif PWM
    motif: Motif = read_JASPAR_motif(motif_file, bg_file, pseudocount,
                                     no_reverse, verbose, debug)
    if verbose: start_mp: float = time.time()
    # get log-odds values for motif
    motif = process_motif_for_logodds(motif, debug)
    if verbose:
        end_mp: float = time.time()
        print("Motif %s processed in %.2fs" % (motif.motifID,
                                               (end_mp - start_mp)))

    return motif
Esempio n. 20
0
def construct_vg(buildvg_args: BuildVG, debug: bool) -> None:
    """Create the genome graphs from the given genome reference and
    phased VCF file.

    The genome is not built as a single whole genome graph but a
    single graph is constructed for each chromosome.
    This approach avoids memory issues and allows the genome variation
    graph construction also on machines with less resources.

    For every requested chromosome the function builds the VG, indexes it
    (XG and GBWT) and then deletes the VG file, keeping only the indexes.
    The VCF is indexed with tabix beforehand when its TBI file is missing
    or when reindexing is requested. All graph files are written in the
    output directory; the original working directory is restored before
    returning.

    Parameters
    ----------
    buildvg_args : BuildVG
        container for the arguments required to build the genome
        variation graph
    debug : bool
        trace the full error stack
    """
    errmsg: str
    if not isinstance(buildvg_args, BuildVG):
        errmsg = "Expected BuildVG object, got {}.\n"
        exception_handler(TypeError,
                          errmsg.format(type(buildvg_args).__name__), debug)

    # read the arguments to build the VGs
    reindex: bool = buildvg_args.reindex
    chroms: List[str] = buildvg_args.chroms
    chroms_prefix: str = buildvg_args.chroms_prefix
    namemap: Dict = buildvg_args.namemap
    threads: int = buildvg_args.cores
    outdir: str = buildvg_args.outdir
    verbose: bool = buildvg_args.verbose
    test: bool = buildvg_args.get_test()  # manually set in the code
    msg: str
    reference: str
    vcf: str

    if test:
        reference = get_reference_genome_from_ucsc(debug)
        vcf = get_1000GProject_vcf(debug)
    else:
        reference = buildvg_args.reference_genome
        vcf = buildvg_args.vcf

    if verbose:
        print("using reference genome: ", reference)
        print("Using VCF file: ", vcf, "\n\n")
        start_c: float = time.time()
        print("Reading chromosome names from {}...".format(reference))
    # read chromosome names in reference FASTA
    chroms_available: List[str] = get_chromlist(reference, debug)
    if verbose:
        end_c: float = time.time()
        print("done in %.2fs" % (end_c - start_c))
        print("Found chromosomes:\n", chroms_available, end="\n\n")
    if len(chroms) == 1 and chroms[0] == ALL_CHROMS:
        chroms = chroms_available
    else:
        # check user-defined chromosome names consistency with names in
        # reference
        for c in chroms:
            if c not in chroms_available:
                errmsg = "Chromosome \"{}\" not found among names in {}.\n"
                exception_handler(ValueError, errmsg.format(c, reference),
                                  debug)

    cwd: str = os.getcwd()
    cmd: str
    code: int
    # check if the VCF file has already been indexed with tabix
    if not tbiexist(vcf):
        msg = "TBI file not found for {}. Indexing VCF file with tabix..."
        print(msg.format(vcf.split('/')[-1]))
        cmd = 'tabix -p vcf {0}'.format(vcf)
        code = subprocess.call(cmd, shell=True)
        if code != 0:  # tabix didn't work
            errmsg = "An error occurred while executing \"{}\". Exiting.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)
    elif reindex:  # user asked to reindex VCF
        msg = "Reindexing {}...\n"
        print(msg.format(vcf.split('/')[-1]))
        # remove old index
        cmd = "rm {0}".format(''.join([vcf, ".tbi"]))
        code = subprocess.call(cmd, shell=True)
        if code != 0:
            errmsg = "An error occurred while executing \"{}\". Exiting.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)
        # reindex the VCF
        cmd = "tabix -p vcf {0}".format(vcf)
        code = subprocess.call(cmd, shell=True)
        if code != 0:
            # tabix didn't work
            errmsg = "An error occurred while executing \"{}\". Exiting.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)
    # end if

    # enter the output directory; the finally clause below guarantees that
    # the original working directory is restored even when a step fails
    os.chdir(outdir)
    try:
        # chromosome prefix and name map are mutually exclusive
        if chroms_prefix: assert not bool(namemap)
        if bool(namemap): assert chroms_prefix != "chr"
        # build the VG for each chromosome or only for those told by user
        for chrname in chroms:
            if namemap:
                try:
                    chrom: str = namemap[chrname]
                except KeyError:
                    errmsg = "Missing out name map for chromosome \"{}\".\n"
                    exception_handler(KeyError, errmsg.format(chrname), debug)
            else:
                chrom = "".join([chroms_prefix, chrname])
            # the leading dot is required by the index naming done in indexVG
            vg: str = ''.join([".", chrom, '.vg'])
            # build VG for current chromosome
            if verbose:
                start_build: float = time.time()
            code = build_vg(vg, reference, vcf, chrname, threads)
            if code != 0:
                errmsg = "An error occurred during construction of {}.\n"
                exception_handler(VGError, errmsg.format(vg), debug)
            if verbose:
                end_build: float = time.time()
                msg = "Elapsed time to build {}:"
                print(msg.format(vg), "%.2fs" % (end_build - start_build), sep=" ")
            # index VG
            if verbose:
                start_index: float = time.time()
            msg = "Indexing {} VG and building the GBWT index..."
            print(msg.format(chrom))
            code = indexVG(vg, vcf, threads, verbose, debug)
            if code != 0:
                errmsg = "An error occurred while indexing {}."
                exception_handler(VGError, errmsg.format(vg), debug)
            if verbose:
                end_index: float = time.time()
                msg = "Elapsed time to index {}"
                print(msg.format(vg), "%.2fs" % (end_index - start_index), sep=" ")
            # end if

            # The majority of applications work only with indexed graph,
            # so to save disk space is worth to delete the VGs and keep
            # only the XGs (is simple to go back using VG built-in functions)
            if verbose:
                print("Deleting {0}".format(vg))
            cmd = 'rm {0}'.format(vg)
            # capture the exit status of rm itself; otherwise the check below
            # would re-test the (already verified) status of the indexing step
            code = subprocess.call(cmd, shell=True)
            if code != 0:
                errmsg = "An error occurred while executing \"{}\". Exiting.\n"
                exception_handler(SubprocessError, errmsg.format(cmd), debug)
        # end for
    finally:
        # return to the original working directory
        os.chdir(cwd)
Esempio n. 21
0
def writeGFF3(prefix: str, data: pd.DataFrame, no_qvalue: bool,
              debug: bool) -> None:
    """Write the motif occurrence candidates to ``<prefix>.gff`` in GFF3
    format (https://www.ensembl.org/info/website/upload/gff3.html).

    The GFF3 file annotates the potential motif occurrences found by GRAFIMO.
    The report can be loaded as custom track to the UCSC genome browser for
    results visualization.

    Every row of `data` becomes one feature line. Coordinates of hits on the
    "-" strand are swapped, so that start/stop are always written in the
    order used for the forward strand.

    ...

    Parameters
    ----------
    prefix : str
        out filename prefix
    data : pandas.DataFrame
        analysis results
    no_qvalue : bool
        ignore q-values (11 result columns are expected instead of 12)
    debug : bool
        trace the full error stack

    Notes
    -----
    All failures are reported through `exception_handler` (defined elsewhere
    in the project): TypeError for wrong argument types, ValueError for an
    unexpected number of columns, FileWriteError for any other error raised
    while writing.
    """

    if not isinstance(prefix, str):
        errmsg = "Expected str, got {}.\n"
        # str.format is required here: str.format_map expects a mapping and
        # fails on a positional "{}" field instead of building the message
        exception_handler(TypeError, errmsg.format(type(prefix).__name__),
                          debug)
    if not isinstance(data, pd.DataFrame):
        errmsg = "Expected pandas.DataFrame, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(data).__name__), debug)
    if not isinstance(no_qvalue, bool):
        errmsg = "Expected bool, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(no_qvalue).__name__),
                          debug)

    # column-wise view of the results: data_list[k][i] is column k of row i
    data_list = dftolist(data, no_qvalue, debug)
    # bound before the try block so the error handler can always name the file
    gfffn = ".".join([prefix, "gff"])
    try:
        # the context manager closes the stream on every exit path and never
        # touches an unbound handle when open() itself fails
        with open(gfffn, mode='w+') as ofstream:
            ofstream.write("##gff-version 3\n")
            if not no_qvalue and len(data_list) != 12:
                errmsg = "Q-values columns seems to be missing.\n"
                exception_handler(ValueError, errmsg, debug)
            if no_qvalue and len(data_list) != 11:
                errmsg = "Too many or too few columns.\n"
                exception_handler(ValueError, errmsg, debug)
            for i in range(len(data_list[0])):
                seqname: str = data_list[2][i]
                chrom: str = seqname.split(":")[0]  # take only chromosome name
                score: float = round(data_list[6][i], 1)
                strand: str = data_list[5][i]
                if strand == "-":  # keep forward strand coordinates
                    start, stop = str(data_list[4][i]), str(data_list[3][i])
                else:
                    start, stop = str(data_list[3][i]), str(data_list[4][i])
                motifID: str = data_list[0][i]
                motifName: str = data_list[1][i]
                # format_float_scientific returns a string, not a float
                pvalue: str = np.format_float_scientific(data_list[7][i],
                                                         exp_digits=2)
                sequence: str = data_list[8][i]
                reference: str = data_list[10][i]
                # gff line attributes
                attributes: List[str] = [
                    "".join(["Name=", motifID, "_", seqname, strand, ":",
                             reference]),
                    "".join(["Alias=", motifName]),
                    "".join(["ID=", motifID, "-", motifName, "-", seqname]),
                    "".join(["pvalue=", pvalue]),
                ]
                if not no_qvalue:
                    qvalue: str = np.format_float_scientific(
                        data_list[11][i], exp_digits=2)
                    attributes.append("".join(["qvalue=", qvalue]))
                # the sequence attribute also terminates the gff line
                attributes.append("".join(["sequence=", sequence, ";\n"]))
                # full gff line
                gffline: str = "\t".join([
                    chrom, SOURCE, TP, start, stop,
                    str(score), strand, PHASE, ";".join(attributes)
                ])
                ofstream.write(gffline)
    except Exception:
        # Exception (not a bare except) so that SystemExit/KeyboardInterrupt
        # are not re-reported as a file writing failure
        errmsg = "An error ocurred while writing {}.\n"
        exception_handler(FileWriteError, errmsg.format(gfffn), debug)
Esempio n. 22
0
def scale_pwm(motif_matrix: np.ndarray, alphabet: List[str], motif_width: int,
              nucsmap: dict,
              debug: bool) -> Tuple[np.ndarray, int, int, int, np.double]:
    """Turn the motif log-odds matrix into a matrix of integer scores.

    Each entry addressed by (nucsmap[nucleotide], position) is shifted by an
    offset (the floor of the matrix minimum) and multiplied by a scaling
    factor derived from RANGE and the spread of the matrix values; the result
    is rounded and cast to int. The scaled values are documented to lie in
    [0, 1000]. Working on integer scores speeds up the scoring of potential
    motif occurrences and allows constant time p-value estimation.

    ...

    Parameters
    ----------
    motif_matrix : numpy.ndarray
        motif log-odds matrix
    alphabet: list
        DNA motif alphabet
    motif_width: int
        motif width
    nucsmap: dict
        nucleotide index map
    debug : bool
        trace the full error stack

    Returns
    -------
    numpy.ndarray
        scaled motif score matrix
    int
        minimum value of the scaled score matrix
    int
        maximum value of the scaled score matrix
    int
        scaling factor
    numpy.double
        scaling offset
    """

    # ---- input validation (reported through exception_handler)
    if not isinstance(motif_matrix, np.ndarray):
        exception_handler(
            TypeError,
            "Expected numpy.ndarray, got {}.\n".format(
                type(motif_matrix).__name__), debug)
    if motif_matrix.size == 0 or sum(sum(motif_matrix)) == 0:
        exception_handler(ValueError, "The motif log-odds natrix is empty.\n",
                          debug)
    if not isinstance(alphabet, list):
        exception_handler(
            TypeError,
            "Expected list, got {}.\n".format(type(alphabet).__name__), debug)
    if not isListEqual(alphabet, DNA_ALPHABET):
        exception_handler(ValueError,
                          "The motif is not built on DNA alphabet.\n", debug)
    if not isinstance(motif_width, int):
        exception_handler(
            TypeError,
            "Expected int, got {}.\n".format(type(motif_width).__name__),
            debug)
    if motif_width <= 0:
        exception_handler(ValueError, "Forbidden motif width.\n", debug)
    if not isinstance(nucsmap, dict):
        exception_handler(
            TypeError,
            "Expected dict, got {}.\n".format(type(nucsmap).__name__), debug)

    # ---- scaling parameters
    smallest = motif_matrix.min()
    largest = motif_matrix.max()
    if smallest == largest:
        # all values are equal: widen the interval to avoid a zero spread
        smallest = np.double(largest - 1)
    floored_min = np.floor(smallest)
    offset = np.round(floored_min)
    scale_factor = np.floor(RANGE / (largest - floored_min))

    # ---- scale entry by entry; cells not addressed by (nucsmap, width)
    # keep the initial zero
    scaled = np.zeros(motif_matrix.shape, dtype=np.double)
    for nuc in alphabet:
        row = nucsmap[nuc]
        for col in range(motif_width):
            scaled[row, col] = np.round(
                (motif_matrix[row, col] - offset) * scale_factor)
    # make sure the values are integers
    scaled = scaled.astype(int)

    return (scaled, int(scaled.min()), int(scaled.max()), int(scale_factor),
            offset)
Esempio n. 23
0
def get_motif_pwm(motif_file: str, args_obj: Findmotif, cores: int,
                  debug: bool) -> List[Motif]:
    """Construction of Motif object(s) from a PWM file.

    The file format is detected once (MEME or JASPAR) and the matching
    builder is called (`build_motif_JASPAR` or `build_motif_MEME`). The
    builders are responsible for producing the scoring matrix and the
    P-value matrix of each motif. Whatever the builder returns, the result
    is handed back as a list.

    ...

    Parameters
    ----------
    motif_file : str
        path to motif PWM file (MEME or JASPAR format)
    args_obj : Findmotif
        arguments container; the attributes bgfile, pseudo, noreverse and
        verbose are read
    cores : int
        CPU cores to use during motif processing (used only when
        processing MEME motif files with multiple PWMs)
    debug : bool
        trace the full error stack

    Returns
    -------
    List[Motif]
        Motif objects
    """

    bg_file = args_obj.bgfile
    pseudo: float = args_obj.pseudo
    no_reverse: bool = args_obj.noreverse
    verbose: bool = args_obj.verbose
    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)

    # Run each format detector exactly once (MEME first, as before) and reuse
    # the answers; previously every detector could be invoked twice per file.
    is_meme: bool = isMEME_ff(motif_file, debug)
    is_jaspar: bool = isJaspar_ff(motif_file, debug)

    # choose motif PWM parsing method (JASPAR takes precedence, as before)
    if is_jaspar:
        motif = build_motif_JASPAR(motif_file, bg_file, pseudo, no_reverse,
                                   verbose, debug)
    elif is_meme:
        motif = build_motif_MEME(motif_file, bg_file, pseudo, no_reverse,
                                 cores, verbose, debug)
    else:
        errmsg = "Motif PWM must be in MEME or JASPAR format.\n"
        exception_handler(MotifFileFormatError, errmsg, debug)

    # list instance required to proceed
    if not isinstance(motif, list):
        motif = [motif]
    return motif
Esempio n. 24
0
def write_results(results: pd.DataFrame, motif: Motif, motif_num: int,
                  args_obj: Findmotif, debug: bool) -> None:
    """Write GRAFIMO results in three files (TSV report, HTML report, GFF3 file).

    The TSV and HTML reports store the found potential motif occurrences in
    tabular format.

    The GFF3 report stores annotations for the found motif occurrence
    candidates.

    The files are written inside the output directory, which is created when
    missing. When `args_obj.top_graphs` is positive, PNG images of the first
    distinct regions of the results are also produced in a sub-directory.
    The working directory in effect at call time is restored before
    returning, also when an error is raised along the way.

    ...

    Parameters
    ----------
    results : pandas.DataFrame
        analysis results
    motif : Motif
        motif
    motif_num : int
        number of searched motifs
    args_obj : Findmotif
        commandline arguments container
    debug : bool
        trace the full error stack
    """

    if not isinstance(results, pd.DataFrame):
        errmsg = "Expected pandas.DataFrame, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(results).__name__),
                          debug)
    if len(results) == 0:
        errmsg = "No potential motif occurrence retreived.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(motif, Motif):
        errmsg = "Expected Motif, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif).__name__),
                          debug)
    if not isinstance(motif_num, int):
        errmsg = "Expected int, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_num).__name__),
                          debug)
    if motif_num <= 0:
        errmsg = "No motif searched. Probably something went wrong.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(args_obj, Findmotif):
        errmsg = "Expected Findmotif, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(args_obj).__name__),
                          debug)

    # get results storing arguments
    outdir: str = args_obj.outdir
    no_qvalue: bool = args_obj.noqvalue
    top_graphs: int = args_obj.top_graphs
    verbose: bool = args_obj.verbose
    # has_graphgenome_dir has to be *called*: the bare bound method is always
    # truthy, which made the error branch below unreachable
    if not (args_obj.has_graphgenome() or args_obj.has_graphgenome_dir()):
        errmsg = "No genome variation graph given.\n"
        exception_handler(VGError, errmsg, debug)
    cwd: str = os.getcwd()
    dirname_default: bool = outdir == DEFAULT_OUTDIR
    if dirname_default:
        # to make unique the output directory we add the PID
        # to the name.
        #
        # This is useful when calling grafimo in different runs on the
        # same machine.
        outdir = "_".join(["grafimo_out", str(os.getpid()), motif.motifID])
    if not os.path.isdir(outdir):
        # argument list instead of a shell string: directory names holding
        # spaces or shell metacharacters are passed to mkdir verbatim
        cmd = ["mkdir", "-p", outdir]
        code = subprocess.call(cmd)
        if code != 0:
            errmsg = "An error occurred while executing {}.\n"
            exception_handler(SubprocessError, errmsg.format(" ".join(cmd)),
                              debug)
    os.chdir(outdir)  # an existing directory has its content overwritten
    try:
        print("\nWriting results in %s.\n" % outdir)
        if not dirname_default and motif_num > 1:
            # each file is labeled with the motif ID
            prefix = "_".join(["grafimo_out", motif.motifID])
        else:
            prefix = "grafimo_out"
        if verbose:
            start_tsv: float = time.time()
        # write the TSV
        results.to_csv(".".join([prefix, "tsv"]), sep='\t', encoding='utf-8')
        if verbose:
            end_tsv: float = time.time()
            print("%s.tsv written in %.2fs" % (prefix, (end_tsv - start_tsv)))
            start_html: float = time.time()
        # write the HTML
        results.to_html(".".join([prefix, "html"]))
        if verbose:
            end_html: float = time.time()
            print("%s.html written in %.2fs" % (prefix,
                                                (end_html - start_html)))
            start_gff: float = time.time()
        # write the GFF3
        writeGFF3(prefix, results, no_qvalue, debug)
        if verbose:
            end_gff: float = time.time()
            print("%s.gff written in %.2fs" % (prefix, (end_gff - start_gff)))
        # get the graphs of the top n regions
        if top_graphs > 0:
            _write_top_graphs(results, motif, motif_num, args_obj, top_graphs,
                              verbose, debug)
    finally:
        # go back to the caller's directory on success and on failure
        os.chdir(cwd)


def _write_top_graphs(results: pd.DataFrame, motif: Motif, motif_num: int,
                      args_obj: Findmotif, top_graphs: int, verbose: bool,
                      debug: bool) -> None:
    """Render the PNG images of the first `top_graphs` distinct regions.

    The images are written in a "top_graphs" sub-directory of the current
    working directory (suffixed with the motif ID when more than one motif
    was searched). The function changes the working directory to that
    sub-directory and does not restore it: the caller is expected to do so.
    """

    # dict.fromkeys drops repeated regions while keeping the results order
    sequence_names = results["sequence_name"].tolist()
    regions = list(dict.fromkeys(sequence_names))[:top_graphs]
    if len(regions) == 0:
        errmsg = "No region obtained, the results seems to be empty.\n"
        exception_handler(ValueError, errmsg, debug)
    if len(regions) < top_graphs:
        warnmsg = "WARNING: requested %d regions, obtained %d.\n"
        print(warnmsg % (top_graphs, len(regions)))
    if verbose:
        print("Extracting %d region variation graphs" % len(regions))
    # create the directory for the regions images
    if motif_num > 1:
        image_dir = "_".join(["top_graphs", motif.motifID])
    else:
        image_dir = "top_graphs"
    if verbose:
        print("Graphs will be stored in %s." % image_dir)
    cmd = ["mkdir", "-p", image_dir]
    code = subprocess.call(cmd)
    if code != 0:
        errmsg = "An error ocurred while executing {}."
        exception_handler(SubprocessError, errmsg.format(" ".join(cmd)),
                          debug)
    assert os.path.isdir(image_dir)
    os.chdir(image_dir)
    print("Writing the top %d graphs in %s\n" % (len(regions), image_dir))
    try:
        for region in regions:
            if verbose:
                print("Computing the PNG image of {}".format(region))
            if args_obj.has_graphgenome():
                get_region_graph(region,
                                 args_obj.chroms_prefix,
                                 args_obj.namemap,
                                 debug,
                                 graph_genome=args_obj.graph_genome)
            elif args_obj.has_graphgenome_dir():
                get_region_graph(region,
                                 args_obj.chroms_prefix,
                                 args_obj.namemap,
                                 debug,
                                 graph_genome_dir=args_obj.graph_genome_dir)
            else:
                errmsg = "Unknown VG type. Unable to print regions PNG images.\n"
                exception_handler(ValueError, errmsg, debug)
    except Exception:
        # Exception (not a bare except) so that SystemExit/KeyboardInterrupt
        # are not re-reported as a VG failure
        errmsg = "An error occurred while computing PNG image of {}.\n"
        exception_handler(VGError, errmsg.format(region), debug)
Esempio n. 25
0
def get_region_graph(
    region: str,
    chroms_prefix: str,
    namemap: dict,
    debug: bool,
    graph_genome: Optional[str] = None,
    graph_genome_dir: Optional[str] = None,
) -> None:
    """Compute the PNG image of a genomic region encoded in genome variation
    graph(s).

    The region is extracted with ``vg find``, converted to DOT with
    ``vg view`` and rendered with ``dot``; the image is written to
    ``<region>.png`` in the current working directory. The intermediate
    hidden files (``.<region>.vg`` and ``.<region>.dot``) are removed at the
    end, together with any other file matching ``.*.vg`` / ``.*.dot`` in the
    working directory.

    Exactly one between `graph_genome` and `graph_genome_dir` must be given.

    ...

    Parameters
    ----------
    region : str
        genomic region
    chroms_prefix : str
        chromosome prefix
    namemap : dict
        chromosome names map
    debug : bool
        trace the full error stack
    graph_genome : str
        path to genome variation graph
    graph_genome_dir : str
        path to directory of genome variation graphs
    """

    def _run(cmd: str) -> None:
        """Run `cmd` through the shell; report a non-zero exit status."""
        # NOTE(review): paths and region names are interpolated unquoted in
        # the shell command lines below; values holding spaces or shell
        # metacharacters are not handled.
        code = subprocess.call(cmd, shell=True)
        if code != 0:
            errmsg = "An error occurred while executing {}.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)

    if not isinstance(region, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(region).__name__),
                          debug)
    if not isinstance(chroms_prefix, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError,
                          errmsg.format(type(chroms_prefix).__name__), debug)
    if not isinstance(namemap, dict):
        errmsg = "Expected dict, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(namemap).__name__),
                          debug)
    if graph_genome is None and graph_genome_dir is None:
        errmsg = "graph_genome and graph_genome_dir cannot be both None.\n"
        exception_handler(ValueError, errmsg, debug)
    if graph_genome is not None and graph_genome_dir is not None:
        errmsg = "graph_genome and graph_genome_dir cannot be both not None.\n"
        exception_handler(ValueError, errmsg, debug)
    if graph_genome is not None:
        if not isinstance(graph_genome, str):
            errmsg = "Expected str, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(graph_genome).__name__),
                              debug)
        if not os.path.isfile(graph_genome):
            errmsg = "Unable to locate {}.\n"
            exception_handler(FileNotFoundError, errmsg.format(graph_genome),
                              debug)
    if graph_genome_dir is not None:
        if not isinstance(graph_genome_dir, str):
            errmsg = "Expected str, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(graph_genome_dir).__name__),
                              debug)
        if not os.path.isdir(graph_genome_dir):
            errmsg = "Unable to locate {}."
            exception_handler(FileNotFoundError,
                              errmsg.format(graph_genome_dir), debug)

    # pick the graph index to query
    if graph_genome and graph_genome_dir is None:
        # single genome variation graph
        xg = graph_genome
    else:
        # one graph per chromosome: <graph_genome_dir>/<prefix><chrom>.xg
        chrom = region.split(":")[0]
        if namemap:
            chrom = namemap[chrom]
        chrname = "".join([chroms_prefix, chrom])
        xg = os.path.join(graph_genome_dir, ".".join([chrname, "xg"]))

    # region subgraph -> DOT -> PNG
    vgregion = "".join([".", region, ".vg"])
    _run("vg find -x {} -E -p {} > {}".format(xg, region, vgregion))
    dotregion = "".join([".", region, ".dot"])
    _run("vg view {} -dp > {}".format(vgregion, dotregion))
    pngimage = ".".join([region, "png"])
    _run("dot -Tpng {} -o {}".format(dotregion, pngimage))

    # remove unused files; the failing command is now always reported (the
    # last check used to call errmsg.format() without any argument)
    for leftovers in (".*.vg", ".*.dot"):
        _run("rm -rf {}".format(leftovers))
Esempio n. 26
0
def read_JASPAR_motif(motif_file: str, bg_file: str, pseudocount: float,
                      no_reverse: bool, verbose: bool, debug: bool) -> Motif:
    """Read a motif PWM in JASPAR format.

    The first line is the header: its leading character is dropped and the
    first two tab-separated fields are taken as motif ID and motif name.
    Every following line carries the nucleotide (first character) and its
    counts, enclosed between two delimiter tokens that are discarded.
    Parsing stops at the first blank line or at the end of the file.

    The data read are then used to build the motif probability matrix
    (normalized and with pseudocounts applied), which is stored in a Motif
    object together with the background distribution.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in JASPAR format
    bg_file
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or the UNIF constant
        to request a uniform background
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand handling flag, forwarded to pseudo_bg
        (NOTE(review): presumably True restricts the analysis to the forward
        strand -- confirm against pseudo_bg)
    verbose : bool
        print additional information
    debug:
        trace the full error stack

    Returns
    -------
    Motif
        Motif object
    """

    nucs: List[str] = list()
    counts: List[List[float]] = list()  # one row of counts per nucleotide
    if verbose:
        start_rm: float = time.time()
    try:
        # the context manager closes the file on every exit path and never
        # touches an unbound handle when open() itself fails
        with open(motif_file, mode="r") as ifstream:
            # begin parsing
            header: str = ifstream.readline().strip()[1:]
            if not header:  # empty file?
                errmsg = "{} seems to empty.\n"
                exception_handler(IOError, errmsg.format(motif_file), debug)
            motifID, motifName = header.split('\t')[0:2]
            readlines = 1  # check for empty files
            for raw_line in ifstream:
                line = raw_line.strip()
                if not line:
                    break  # blank line: stop reading
                # drop the nucleotide letter, then the two delimiter tokens
                # surrounding the counts
                nucs.append(line[:1].upper())
                counts.append([float(v) for v in line[1:].split()[1:-1]])
                readlines += 1
            if readlines <= 1:  # only header read ?
                errmsg = "{} seems to be empty.\n"
                exception_handler(IOError, errmsg.format(motif_file), debug)
    except Exception:
        # Exception (not a bare except) so that SystemExit/KeyboardInterrupt
        # are not re-reported as a motif reading failure
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(MotifFileReadError, errmsg.format(motif_file), debug)
    else:
        if any(len(c) != len(counts[0]) for c in counts):
            errmsg = "Motif counts width mismatch.\n"
            exception_handler(ValueError, errmsg, debug)
        # nucleotide -> row index, used with np object
        nucsmap = {nuc: idx for idx, nuc in enumerate(nucs)}
        motif_counts: pd.DataFrame = pd.DataFrame(
            data=counts, index=nucs)  # motif count matrix
        motif_width: int = int(len(counts[0]))
        alphabet: list = sorted(nucs)

        # compute background
        if bg_file == UNIF:
            bgs = get_uniformBG(alphabet, debug)
        elif os.path.isfile(bg_file):
            bgs = readBGfile(bg_file, debug)
        else:
            errmsg = "Unable to parse {}.\n"
            exception_handler(BGFileError, errmsg.format(bg_file), debug)
        bgs = pseudo_bg(bgs, no_reverse, debug)  # add pseudocount to bg

        # motif probability matrix
        motif_probs = (motif_counts / motif_counts.sum(0))
        motif_probs = norm_motif(motif_probs, motif_width, alphabet, debug)
        motif_probs = apply_pseudocount_jaspar(motif_counts.to_numpy(),
                                               motif_probs.to_numpy(),
                                               pseudocount, bgs, motif_width,
                                               alphabet, nucsmap, debug)
        motif: Motif = Motif(motif_probs, motif_width, alphabet, motifID,
                             motifName, nucsmap)
        motif.setBg(bgs)

        if verbose:
            end_rm: float = time.time()
            msg: str = "Read motif %s in %.2fs" % (motifID,
                                                   (end_rm - start_rm))
            print(msg)

    return motif
Esempio n. 27
0
def build_motif_MEME(motif_file: str, bg_file: str, pseudocount: float,
                     no_reverse: bool, cores: int, verbose: bool,
                     debug: bool) -> List[Motif]:
    """Read motif PWMs in MEME format and process them.

    The motifs are read with `read_MEME_motif` and each of them is passed to
    `process_motif_for_logodds`, which is responsible for computing the
    scoring matrix and the P-value matrix used to assign a statistical
    significance to each motif occurrence candidate.

    When the file stores at least `cores` motifs they are processed in
    parallel by a pool of `cores` worker processes, while a progress bar is
    refreshed once per second; otherwise they are processed one after the
    other.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    bg_file
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html).
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand handling flag, forwarded to read_MEME_motif
    cores : int
        number of CPU cores (used when MEME file has more than one PWM)
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack

    Returns
    -------
    List[Motif]
        processed Motif objects, one per PWM stored in motif_file. If the
        parallel processing is interrupted with CTRL-C and sigint_handler()
        returns, None is returned implicitly.
    """

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isMEME_ff(motif_file, debug):
        errmsg = "Required MEME motif PWM parsing, but {} is not in MEME format.\n"
        exception_handler(MotifFileFormatError, errmsg.format(motif_file),
                          debug)

    if verbose:
        start_rm_all: float = time.time()
    motif_lst: List[Motif] = read_MEME_motif(motif_file, bg_file, pseudocount,
                                             no_reverse, verbose, debug)
    motif_num: int = len(motif_lst)
    if verbose:
        end_rm_all: float = time.time()
        print("Read all motifs in %s in %.2fs." %
              (motif_file, (end_rm_all - start_rm_all)))
    print("\nRead {} motifs in {}".format(motif_num, motif_file))
    print("\nProcessing motifs\n")

    complete_motifs: List[Motif] = list()  # fully processed motifs
    if verbose:
        start_mp_all: float = time.time()
    if motif_num >= cores:  # worth to use multiprocessing
        # The workers are forked while SIGINT is ignored, then the original
        # handler is restored in the parent only, to exit gracefully
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = mp.Pool(processes=cores)
        signal.signal(signal.SIGINT, original_sigint_handler)

        try:
            args = [(motif, debug) for motif in motif_lst]
            res = pool.starmap_async(process_motif_for_logodds, args)
            # Take the total before polling, so it is bound even when the
            # workers finish before the first check (it used to be assigned
            # only after a first unsuccessful ready() test). The lower bound
            # of 1 keeps the final call well-formed if nothing was left.
            tot = max(res._number_left, 1)
            # ---- progress bar
            while not res.ready():
                remaining = res._number_left
                printProgressBar((tot - remaining),
                                 tot,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
                time.sleep(1)
            # when finished call for the last time printProgressBar()
            printProgressBar(tot,
                             tot,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)
            complete_motifs += res.get(60 * 60 * 60)  # does not ignore signals
        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()
        else:
            pool.close()
            if verbose:
                end_mp_all: float = time.time()
                print("Processed motif(s) in %s in %.2fs" %
                      (motif_file, (end_mp_all - start_mp_all)))
            return complete_motifs
    else:
        for m in motif_lst:  # process each found motif
            complete_motifs.append(process_motif_for_logodds(m, debug))
        if verbose:
            end_mp_all: float = time.time()
            print("Processed motif(s) in %s in %.2fs" %
                  (motif_file, (end_mp_all - start_mp_all)))
        return complete_motifs
Esempio n. 28
0
def read_MEME_motif(motif_file: str, bg_file: str, pseudocount: float,
                    no_reverse: bool, verbose: bool,
                    debug: bool) -> List[Motif]:
    """Read the motif PWMs stored in a MEME format file.

    The alphabet is read once and shared by all the motifs. Every line
    starting with "MOTIF" opens a new motif: its ID and name are taken from
    that line (the name defaults to the ID when only one field is given),
    then its statistics and its matrix are read from the following lines.

    Since a MEME file can contain one or more motifs, for each stored PWM
    is built the corresponding Motif object, after normalization and
    pseudocount application. Each Motif also stores the background
    distribution.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM(s) in MEME format
    bg_file
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or the UNIF constant
        to request a uniform background
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand handling flag, forwarded to pseudo_bg
    verbose : bool
        print additional information
    debug:
        trace the full error stack

    Returns
    -------
    List[Motif]
        list of Motif objects
    """

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isinstance(bg_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(bg_file).__name__),
                          debug)
    if bg_file != UNIF and not os.path.isfile(bg_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(bg_file), debug)
    if not isinstance(pseudocount, float):
        errmsg = "Expected float, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(pseudocount).__name__),
                          debug)
    if pseudocount <= 0:
        errmsg = "The pseudocount must be > 0.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(no_reverse, bool):
        errmsg = "Expected bool, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(no_reverse).__name__),
                          debug)

    motifs_raw = list()  # data parsed from the file, one dict per motif
    motifs: List[Motif] = list()
    # begin motif parsing
    try:
        # the context manager closes the file on every exit path and never
        # touches an unbound handle when open() itself fails
        with open(motif_file, mode="r") as ifstream:
            alphabet = __read_alphabet_meme(motif_file, ifstream,
                                            debug)  # shared by all motifs
            # nucleotide -> row index, used with np object
            nucsmap = {nuc: idx for idx, nuc in enumerate(alphabet)}
            # The readers called below consume lines from the same stream,
            # so the loop resumes right after each parsed motif and simply
            # ends when the file is exhausted.
            for line in ifstream:
                if not line.startswith("MOTIF"):
                    continue  # not a new motif instance
                if verbose:
                    start_rm = time.time()
                motifids = line.split()
                if len(motifids) == 2:  # only name
                    motif_id = motifids[1]
                    motif_name = motif_id
                else:  # assume first two fields: id, name
                    motif_id, motif_name = motifids[1:3]
                statistics = __read_statistics_meme(motif_file, ifstream,
                                                    debug)
                probs = __read_counts_meme(motif_file, ifstream,
                                           statistics["width"], debug)
                motifs_raw.append({
                    "motifId": motif_id,
                    "motifName": motif_name,
                    "statistics": statistics,
                    "counts": probs
                })
                if verbose:
                    end_rm = time.time()
                    print("Read motif %s in %.2fs." % (motif_name,
                                                       (end_rm - start_rm)))
    except Exception:
        # Exception (not a bare except) so that SystemExit/KeyboardInterrupt
        # are not re-reported as a motif reading failure
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(MotifFileReadError, errmsg.format(motif_file), debug)
    else:
        if bg_file == UNIF:
            bgs = get_uniformBG(alphabet, debug)
        elif os.path.isfile(bg_file):
            bgs = readBGfile(bg_file, debug)
        else:
            errmsg = "Unable to parse {}.\n"
            exception_handler(BGFileError, errmsg.format(bg_file), debug)
        bgs = pseudo_bg(bgs, no_reverse, debug)  # add pseudocount to bg
        for raw in motifs_raw:
            width = raw["statistics"]["width"]
            # named motif_probs: "mp" is the multiprocessing alias used
            # elsewhere in this file
            motif_probs = pd.DataFrame(np.matrix(raw["counts"]))
            motif_probs.index = alphabet
            motif_probs = norm_motif(motif_probs, width, alphabet, debug)
            motif_probs = apply_pseudocount_meme(motif_probs.to_numpy(),
                                                 pseudocount,
                                                 raw["statistics"]["nsites"],
                                                 width, bgs, alphabet,
                                                 nucsmap, debug)
            motif: Motif = Motif(motif_probs, width, alphabet,
                                 raw["motifId"], raw["motifName"], nucsmap)
            motif.setBg(bgs)
            motifs.append(motif)

    return motifs
Esempio n. 29
0
def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """GRAFIMO command-line entry point.

    The function reads the command-line arguments, checks their consistency
    with respect to the requested workflow ("buildvg" or "findmotif"), wraps
    them in the corresponding container (BuildVG or Findmotif), verifies the
    external dependencies and finally runs the workflow. The total elapsed
    time is printed when the workflow returns.

    ...

    Parameters
    ----------
    cmdLineargs : list, optional
        command-line arguments; when None, sys.argv[1:] is used

    Notes
    -----
    Invalid arguments are reported through the parser's ``error()`` method.
    A KeyboardInterrupt raised at any point is routed to ``sigint_handler()``.
    """

    try:
        # starting point of the execution time
        start: float = time.time()

        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # get input args

        # no arguments given --> print help
        if not cmdLineargs:
            parser.error_noargs()
            die(2)

        # the first token must be a help/version flag or a workflow name
        if cmdLineargs[0] not in ("-h", "--help", "--version", "buildvg",
                                  "findmotif"):
            parser.error(
                "The second argument must be one between 'buildvg' and 'findmotif'"
            )
            die(1)

        args: argparse.Namespace = parser.parse_args(cmdLineargs)

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse: float = time.time()

        #--------------------------------------------------------------#
        # check commandline arguments consistency
        #

        #---------------------- general options -----------------------#

        # workflow type
        if args.workflow != "buildvg" and args.workflow != "findmotif":
            parser.error("Unexpected workflow given. Available options:\n"
                         "\tbuildvg: construct VG from user data.\n"
                         "\tfindmotif: scan VG for DNA motif(s) occurrences")
            die(1)

        # cpu cores
        if args.cores < 0:
            parser.error("Negative number of CPU cores given")
        elif args.cores == 0 and args.graph_genome:
            # when whole genome variation graph is given, it is safer to
            # use 1 CPU core by default. This because of the space needed
            # to load the whole VG on RAM.
            #
            # CAVEAT: before requiring more CPU cores to be used, be sure
            # your system has enough memory
            args.cores = 1
        elif args.cores == 0:
            # default option -> use all available CPU cores
            args.cores = mp.cpu_count()
        else:  # args.cores > 0
            if args.cores > mp.cpu_count():
                parser.error("Too many CPU cores to use ({})".format(
                    args.cores))

        # verbosity (a bool is necessarily True or False, so the type
        # check alone is sufficient)
        if not isinstance(args.verbose, bool):
            parser.error(
                '\"--verbose\" does not accept any positional argument')

        # debugging
        if not isinstance(args.debug, bool):
            parser.error("\"--debug\" does not accept any positional argument")

        #---------------------- buildvg options -----------------------#

        buildvg_err_msg: str = "Unexpected arguments for \"grafimo buildvg\": \"{}\""

        if args.workflow == "buildvg":

            # options belonging to findmotif must be left to their defaults
            if args.graph_genome_dir:
                parser.error(buildvg_err_msg.format("-d, --genome-graph-dir"))
                die(1)
            elif args.graph_genome:
                parser.error(buildvg_err_msg.format("-g, --genome-graph"))
                die(1)
            elif args.bedfile:
                parser.error(buildvg_err_msg.format("-b, --bedfile"))
                die(1)
            elif args.motif:
                parser.error(buildvg_err_msg.format("-m, --motif"))
                die(1)
            elif args.bgfile != UNIF:  # if default ignored
                parser.error(buildvg_err_msg.format("-k, --bgfile"))
                die(1)
            elif args.pseudo != 0.1:  # if default ignored
                parser.error(buildvg_err_msg.format("-p, --pseudo"))
                die(1)
            elif args.threshold != 1e-4:  # if default ignored
                parser.error(buildvg_err_msg.format("-t, --thresh"))
                die(1)
            elif args.no_qvalue:
                parser.error(buildvg_err_msg.format("-q, --no-qvalue"))
                die(1)
            elif args.no_reverse:
                parser.error(buildvg_err_msg.format("-r, --no-reverse"))
                die(1)
            elif args.text_only:
                parser.error(buildvg_err_msg.format("-f, --text-only"))
                die(1)
            elif args.chroms_find:
                parser.error(buildvg_err_msg.format("--chroms-find"))
                die(1)
            elif args.chroms_prefix_find:
                parser.error(buildvg_err_msg.format("--chroms-prefix-find"))
                die(1)
            elif args.chroms_namemap_find != NOMAP:  # if default ignored
                parser.error(buildvg_err_msg.format("--chroms-namemap-find"))
                die(1)
            elif args.qval_t:
                parser.error(buildvg_err_msg.format("--qvalueT"))
                die(1)
            elif args.recomb:
                parser.error(buildvg_err_msg.format("--recomb"))
                die(1)
            elif args.top_graphs != 0:  # if default ignored
                parser.error(buildvg_err_msg.format("--top-graphs"))
                die(1)
            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)
            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)
            else:  # arguments for buildvg are correct
                # reference genome
                if args.linear_genome.split('.')[-1] not in ("fa", "fasta"):
                    parser.error(
                        "The reference genome file must be in FASTA format")
                    die(1)
                else:
                    if not os.path.isfile(args.linear_genome):
                        parser.error("Unable to find {}".format(
                            args.linear_genome))
                        die(1)
                    if os.stat(args.linear_genome).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(
                            args.linear_genome))
                        die(1)
                    args.linear_genome = os.path.abspath(args.linear_genome)
                # VCF --> the VCF file must have been compressed with
                # bgzip (https://github.com/samtools/tabix).
                # Both extensions are required: the name must end with
                # ".vcf.gz". Testing the suffix also avoids indexing past
                # the start of the split list for names with < 2 dots.
                if not args.vcf.endswith(".vcf.gz"):
                    parser.error(
                        "Wrong VCF file given. The VCF file must have been "
                        "compressed with bgzip (e.g. myvcf.vcf.gz)")
                    die(1)
                else:
                    if not os.path.isfile(args.vcf):
                        parser.error('Unable to find {}'.format(args.vcf))
                        die(1)
                    if os.stat(args.vcf).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(args.vcf))
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # chromosome to construct VG
                if not args.chroms_build:
                    args.chroms_build = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_build):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-build\""
                        )

                # chromosome name-map
                if args.chroms_namemap_build != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_build):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_build))
                if (args.chroms_prefix_build
                        and args.chroms_namemap_build != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-build\" and \"chroms-namemap-build\" "
                        "cannot used together. Choose one of those options")

                # if no out directory is specified the VGs are stored in
                # the current directory
                if args.out == "":
                    args.out = os.path.abspath("./")

                workflow: BuildVG = BuildVG(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        #---------------------- findmotif options -----------------------#

        findmotif_err_msg: str = "Unexpected arguments for \"grafimo findmotif\": \"{}\""

        if args.workflow == "findmotif":
            # options belonging to buildvg must be left to their defaults
            if args.linear_genome:
                parser.error(findmotif_err_msg.format("-l, --linear-genome"))
                die(1)
            elif args.vcf:
                parser.error(findmotif_err_msg.format("-v, --vcf"))
                die(1)
            elif args.chroms_build:
                parser.error(findmotif_err_msg.format("--chroms-build"))
            elif args.chroms_prefix_build:
                parser.error(findmotif_err_msg.format("--chroms-prefix-build"))
            elif args.chroms_namemap_build != NOMAP:
                parser.error(
                    findmotif_err_msg.format("--chroms-namemap-build"))
            elif args.reindex:  # if default ignored
                parser.error(findmotif_err_msg.format("--reindex"))
                die(1)
            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error(
                    "No arguments given for both \"--genome-graph\" and \"--genome-graph-dir\""
                )
                die(1)
            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)
            elif not args.motif:
                parser.error("No motif PWM given")
                die(1)
            else:
                # only one between graph_genome and graph_genome_dir is allowed
                if args.graph_genome and args.graph_genome_dir:
                    parser.error(
                        "Only one argument between \"--genome-graph\" and \"--genome-graph-dir\""
                        " can be used")
                    die(1)

                # genome graph
                if args.graph_genome:
                    if args.graph_genome.split('.')[-1] not in ("xg", "vg"):
                        parser.error(
                            "Unrecognized genome variation graph format. Only "
                            "VG and XG format are allowed")
                        die(1)
                    elif not os.path.isfile(args.graph_genome):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome = os.path.abspath(args.graph_genome)

                # genome graphs directory
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome_dir))
                        die(1)
                    if not glob(os.path.join(args.graph_genome_dir, "*.xg")):
                        parser.error(
                            "No genome variation graph found in {}".format(
                                args.graph_genome_dir))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome_dir = os.path.abspath(
                            args.graph_genome_dir)

                # BED file
                if args.bedfile:
                    if not isbed(args.bedfile, args.debug):
                        parser.error(
                            "The genomic coordinates must be given in UCSC BED files"
                        )
                        die(1)
                    else:
                        if not os.path.isfile(args.bedfile):
                            parser.error("Unable to locate {}".format(
                                args.bedfile))
                else:
                    parser.error("No BED file given")

                # motif pwm
                if not args.motif:
                    parser.error("No motif PWM given")

                else:
                    motifs: List[str] = args.motif
                    for m in motifs:
                        if not isMEME_ff(m, args.debug) and not isJaspar_ff(
                                m, args.debug):
                            parser.error(
                                "Unrecognized motif PWM file format. "
                                "{} does not follow the MEME or JASPAR format rules"
                                .format(m))
                            die(1)
                        if not os.path.isfile(m):
                            parser.error("Unable to locate {}".format(m))

                # background file
                if args.bgfile != UNIF:
                    if not os.path.isfile(args.bgfile):
                        parser.error("Unable to locate {}".format(args.bgfile))

                # pseudocount
                if args.pseudo <= 0:
                    parser.error(
                        "Pseudocount values must be > 0, got {}".format(
                            args.pseudo))
                    die(1)

                # statistical significance threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error(
                        "Motif statistical significance threshold must be between 0 and 1"
                    )
                    die(1)

                # q-value flag
                if not isinstance(args.no_qvalue, bool):
                    parser.error(
                        "\"--qvalue\" accepts only True or False values")
                    die(1)

                # no reverse flag
                if not isinstance(args.no_reverse, bool):
                    parser.error(
                        "\"--no-reverse\" accepts only True or False values")
                    die(1)

                # text only flag
                if not isinstance(args.text_only, bool):
                    parser.error(
                        "\"--text-only\" accepts only True or False values")
                    die(1)

                # chromosome to consider during VG scan
                if not args.chroms_find:
                    args.chroms_find = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_find):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-find\""
                        )

                # chromosome name-map
                if args.chroms_namemap_find != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_find):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_find))
                if (args.chroms_prefix_find
                        and args.chroms_namemap_find != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-find\" and \"chroms-namemap-find\" "
                        "cannot used together. Choose one of those options")

                # recomb flag
                if not isinstance(args.recomb, bool):
                    parser.error(
                        "\"--recomb\" accepts only True or False values")
                    die(1)

                # out directory
                if args.out == "":  # default option
                    args.out = DEFAULT_OUTDIR
                    print(args.out)

                # threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    parser.error(
                        "\"--qvalueT\" accepts only True or False values")
                    die(1)
                elif args.no_qvalue and args.qval_t:
                    # thresholding on q-values makes no sense when their
                    # computation has been disabled
                    parser.error(
                        "Unable to apply statistical significance threshold on"
                        " q-values if you don't want them")
                    die(1)

                # number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error("Negative number of regions to display")

                workflow: Findmotif = Findmotif(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            sys.stderr.write(
                "Checking GRAFIMO external dependencies {}\n".format(EXT_DEPS))
            start_deps: float = time.time()
        satisfied: bool
        deps_lack: List[str]
        satisfied, deps_lack = check_deps()
        if not satisfied and len(deps_lack) > 0:
            errmsg = "Some dependencies are not satisfied: {}.\nPlease solve them before running GRAFIMO.\n"
            exception_handler(DependencyError, errmsg.format(deps_lack),
                              args.debug)
        elif not satisfied and len(deps_lack) <= 0:
            errmsg = "Dependencies satisfied, but unable to recover them.\n Be sure they are in system PATH.\n"
            exception_handler(DependencyError, errmsg, args.debug)

        if args.verbose and satisfied:
            end_deps: float = time.time()
            print("Dependencies satisfied.")
            print("Dependencies checked in %.2fs." % (end_deps - start_deps))

        #---------------------------------------------------------------
        # dependency check was ok, so we go to workflow selection:
        #   * construction of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG
        if isinstance(workflow, BuildVG): buildvg(workflow, args.debug)
        elif isinstance(workflow, Findmotif): findmotif(workflow, args.debug)
        else:
            errmsg = "Expected BuildVG or Findmotif, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(workflow).__name__),
                              args.debug)

        end: float = time.time()  # GRAFIMO execution finishes here
        print("Elapsed time %.2fs." % (end - start))

    except KeyboardInterrupt:
        sigint_handler()
Esempio n. 30
0
def scan_graph(
    motif: Motif,
    args_obj: Findmotif,
    debug: bool
) -> str:
    """Obtain all the sequences of length K from the genome variation graph.
    K is the motif width.

    The k-mers are extracted from the genomic regions defined in a UCSC BED file
    or ENCODE narrowPeak file.

    By default are extracted only those k-mers found on the alternative genome
    sequences encoded in the scanned genome variation graph(s). It is possible
    to consider all the possible recombinants which can be obtained from the set
    of genetic variants encoded in the VG (--recomb option).

    To perform k-mers extraction are followed the paths (haplotypes) encoded in
    VGs (defined as (V,E,P), where V are set of nodes, E the set of edges, and
    P the set of paths or the haplotypes).

    For every region a "vg find" query is built; the queries are then handed
    to get_kmers() together with a process pool. The function works inside a
    freshly created temporary directory (prefix "grafimo_"), where each query
    redirects its output to a TSV file named after the region; the original
    working directory is restored before returning.

    ...

    Parameters
    ----------
    motif : Motif
        DNA motif
    args_obj : Findmotif
        commandline arguments container
    debug : bool
        trace the full error stack

    Returns
    -------
    str
        location of sequences files
    """

    if not isinstance(motif, Motif):
        errmsg = "Expected Motif, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif).__name__), debug)
    if not isinstance(args_obj, Findmotif):
        errmsg = "Expected Findmotif, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(args_obj).__name__), debug)

    if args_obj.has_graphgenome():  # single VG
        vg = args_obj.graph_genome
        if not isVGindexed(vg, debug):
            errmsg = "The genome variation graph is not indexed, index it before proceeding.\n"
            exception_handler(VGError, errmsg, debug)
    elif args_obj.has_graphgenome_dir():
        vg = args_obj.graph_genome_dir
    else:
        errmsg = "Unexpected genome variation graph given.\n"
        exception_handler(VGError, errmsg, debug)

    bedfile: str = args_obj.bedfile
    chroms: List[str] = args_obj.chroms
    chroms_prefix: str = args_obj.chroms_prefix
    namemap: dict = args_obj.namemap
    cores: int = args_obj.cores
    motif_width: int = motif.width
    # modify global var value
    global verbose
    verbose = args_obj.verbose

    missing_vg_msg: str = "Unable to locate {}. Are your VGs named with \"chr\"? Consider using --chroms-prefix-find or chroms-namemap-find.\n"
    query_template: str = "vg find -p {} -x {} -H {} -K {} -E > {}"
    # last XG index a query was built for; kept bound from the start so the
    # error handler below can always report something meaningful
    xg: str = ""

    # sequence extraction begin
    # NOTE: only Exception subclasses are intercepted, so that Ctrl-C and
    # interpreter exit requests are not masked by a generic scan error.
    try:
        # remember the current location before anything can fail, so that it
        # can be restored once the scan is over
        cwd: str = os.getcwd()
        print("\nExtracting regions defined in {}.\n".format(bedfile))
        if verbose: start_bp = time.time()
        regions, region_num = get_regions_bed(bedfile, debug)
        if verbose:
            end_bp = time.time()
            print("%s parsed in %.2fs. Found %d regions.\n" % (bedfile, (end_bp - start_bp), region_num))
        if args_obj.chroms_num == 1 and chroms[0] == ALL_CHROMS:
            chroms = [c.split("chr")[1] for c in regions.keys()]
        tmpwd: str = tempfile.mkdtemp(prefix='grafimo_')  # create a tmp dir
        os.chdir(tmpwd)  # enter the tmp dir
        # create a list of queries
        queries: List[str] = list()
        # overwrite the default SIGINT handler to exit gracefully
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool: mp.Pool = mp.Pool(processes=cores)  # use no. cores processes
        signal.signal(signal.SIGINT, original_sigint_handler)
        if args_obj.has_graphgenome_dir():
            for chrom in chroms:
                if not bool(namemap):
                    chrname = "".join([chroms_prefix, chrom])
                else:
                    try:
                        if chrom.startswith("chr"):
                            chrname = namemap[chrom.split("chr")[1]]
                        else:
                            chrname = namemap[chrom]
                    except KeyError:
                        errmsg = "Missing out name map for chromosome {}.\n"
                        exception_handler(KeyError, errmsg.format(chrom), debug)
                if chrom.startswith("chr"): positions = regions[chrom]
                else: positions = regions["".join(["chr", chrom])]
                for pos in positions:
                    start: int = pos[0]
                    stop: int = pos[1]
                    if bool(namemap):
                        if chrom.startswith("chr"): c = namemap[chrom.split("chr")[1]]
                        else: c = chrom
                    elif chroms_prefix: c = chrname.split(chroms_prefix)[1]
                    else: c = chrname
                    region_index: str = "-".join(
                        [":".join([c, str(start)]), str(stop)]
                    )
                    region_name: str = "-".join(
                        ["_".join([chrname, str(start)]), str(stop)]
                    )
                    seqs: str = os.path.join(".", ".".join([region_name, "tsv"]))
                    xg = os.path.join(vg, ".".join([chrname, "xg"]))
                    # the GBWT must have the same prefix as XG
                    gbwt: str = os.path.join(vg, ".".join([chrname, "gbwt"]))
                    if not os.path.isfile(xg):
                        exception_handler(VGError, missing_vg_msg.format(xg), debug)
                    if not os.path.isfile(gbwt):
                        exception_handler(VGError, missing_vg_msg.format(gbwt), debug)
                    queries.append(
                        query_template.format(
                            region_index, xg, gbwt, motif_width, seqs
                        )
                    )
            get_kmers(queries, pool, debug, verbose)

        elif args_obj.has_graphgenome():
            for chrom in chroms:
                if not bool(namemap):
                    chrname = "".join([chroms_prefix, chrom])
                else:
                    try:
                        chrname = namemap[chrom]
                    except KeyError:
                        errmsg = "Missing out name map for chromosome {}.\n"
                        exception_handler(KeyError, errmsg.format(chrom), debug)
                if chrom.startswith("chr"):
                    if chrom not in regions.keys():
                        errmsg = "{} does not appear among the chromosomes available in {}.\n"
                        exception_handler(KeyError, errmsg.format(chrom, bedfile), debug)
                    positions = regions[chrom]
                else:
                    if ("".join(["chr", chrom])) not in regions.keys():
                        errmsg = "{} does not appear among the chromosomes available in {}.\n"
                        exception_handler(KeyError, errmsg.format(chrom, bedfile), debug)
                    positions = regions["".join(["chr", chrom])]
                for pos in positions:
                    start: int = pos[0]
                    stop: int = pos[1]
                    if chroms_prefix: c = chrname.split(chroms_prefix)[1]
                    else: c = chrname
                    region_index: str = "-".join(
                        [":".join([c, str(start)]), str(stop)]
                    )
                    region_name: str = "-".join(
                        ["_".join([chrname, str(start)]), str(stop)]
                    )
                    seqs: str = os.path.join(".", ".".join([region_name, "tsv"]))
                    xg = vg
                    xg_prefix: str = xg.split(".xg")[0]
                    # the GBWT must have the same prefix as XG
                    gbwt: str = ".".join([xg_prefix, "gbwt"])
                    if not os.path.exists(xg):
                        exception_handler(VGError, missing_vg_msg.format(xg), debug)
                    if not os.path.isfile(gbwt):
                        exception_handler(VGError, missing_vg_msg.format(gbwt), debug)
                    queries.append(
                        query_template.format(
                            region_index, xg, gbwt, motif_width, seqs
                        )
                    )
            # same call shape as the directory branch: debug must be
            # forwarded, otherwise verbose would take its position
            get_kmers(queries, pool, debug, verbose)
    except Exception:
        errmsg = "An error occurred while scanning {}.\n"
        if args_obj.has_graphgenome_dir():
            # fall back to the directory when the failure happened before any
            # XG index was selected
            exception_handler(VGError, errmsg.format(xg or vg), debug)
        elif args_obj.has_graphgenome():
            exception_handler(VGError, errmsg.format(vg), debug)
        else:
            errmsg = "Chromosome name mismatch. Check chromosome name consistency.\n"
            exception_handler(VGError, errmsg, debug)
    # still inside the temporary directory: this is where the sequences are
    sequence_loc: str = os.getcwd()
    os.chdir(cwd)

    return sequence_loc