Ejemplo n.º 1
0
def get_reference_genome_from_ucsc():
    """
        Download the reference genome (hg38 assembly) from the UCSC
        database into the current directory, uncompress it and return
        the path to it
        ----
        Parameters:
            None
        ----
        Returns:
            genome (str) : absolute path of ./hg38.fa, resolved against
                            the current working directory
        ----
        Raises:
            SubprocessError : if wget, gunzip or rm exits with a
                                non-zero status
    """

    def run_or_raise(cmd):
        # run cmd through the shell and turn a non-zero exit status into
        # a SubprocessError (the raise is the only exit path: the former
        # die(1) calls placed after it could never be reached)
        code = subprocess.call(cmd, shell=True)
        if code != 0:
            errmsg = ''.join(["\n\nERROR: an error occurred while executing ", cmd, ". Exiting"])
            raise SubprocessError(errmsg)

    # download genome (the file is written in the current directory)
    address = 'ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz'
    run_or_raise('wget -c {0}'.format(address))

    # decompress genome
    print("Uncompressing the genome...")
    genome_comp = './hg38.fa.gz'
    run_or_raise('gunzip {0}'.format(genome_comp))

    # remove FASTA.GZ file if still present
    if os.path.exists(genome_comp):
        run_or_raise('rm {0}'.format(genome_comp))

    # get the path to the genome file
    genome_uncomp = "./hg38.fa"
    genome = os.path.abspath(genome_uncomp)

    return genome
Ejemplo n.º 2
0
def get_1000GProject_vcf():
    """
        Downloads a VCF file from the 1000 Genome Project database,
        containing SNPs and indels, into the current directory and
        returns the path to it
        ----
        Parameters:
            None
        ----
        Returns:
            vcf (str) : absolute path to the downloaded VCF file
                        (in .vcf.gz), resolved against the current
                        working directory
        ----
        Raises:
            SubprocessError : if wget exits with a non-zero status
    """

    # download the VCF
    address = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/'
    address += '1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/'
    address += 'ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz'

    cmd = 'wget -c {0}'.format(address)

    code = subprocess.call(cmd, shell=True)

    if code != 0:
        # raising is the only exit path; the die(1) that used to follow
        # the raise was unreachable and has been dropped
        errmsg = ''.join(["\n\nERROR: An error occurred while executing ", cmd, ". Exiting"])
        raise SubprocessError(errmsg)

    vcf_file = './ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz'
    vcf = os.path.abspath(vcf_file)

    return vcf
Ejemplo n.º 3
0
def get_1000GProject_vcf() -> str:
    """Fetch the 1000 Genomes Project WGS sites VCF (SNVs and indels)
    into the current working directory with ``wget``.

    The file is meant for VG construction and graph indexing tests.
    As noted by the original author, the variants in this file are not
    phased, so it cannot be used to build the GBWT index (nor the
    corresponding haplotypes) unless the VCF is phased first.

    Parameters
    ----------

    Returns
    -------
    str
        absolute path to the downloaded VCF file (compressed)

    Raises
    ------
    SubprocessError
        if ``wget`` exits with a non-zero status
    """

    vcf_name: str = 'ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz'

    # remote location of the VCF, assembled from its path components
    url: str = ''.join((
        'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/',
        '1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/',
        vcf_name,
    ))

    # download the VCF; any non-zero exit status is reported as an error
    wget_cmd: str = 'wget -c {0}'.format(url)
    if subprocess.call(wget_cmd, shell=True) != 0:
        raise SubprocessError(
            "\n\nERROR: An error occurred while executing " + wget_cmd + ". Exiting"
        )

    # the download lands in the current directory
    return os.path.abspath('./' + vcf_name)
Ejemplo n.º 4
0
def compute_results(motif: Motif,
                    sequence_loc: str,
                    args_obj: Optional[Findmotif] = None,
                    testmode: Optional[bool] = False
) -> pd.DataFrame:
    """Score all the sequences extracted from the genome variation graph
    in the regions defined in the input BED file.

    The sequences are scored in parallel by ``score_seqs`` worker
    processes, one per chunk of the ``*.tsv`` files found in 
    ``sequence_loc``. The per-process results are then merged, q-values
    are optionally computed and everything is summarized by ``build_df``.
    
    Parameters
    ----------
    motif : Motif
        motif data to score sequences
    sequence_loc : str
        path to the intermediate files containing the sequences 
        extracted from the genome variation graph. Unless ``testmode``
        is set, this location is removed once scoring is over
    args_obj : Findmotif, optional
        container for the arguments needed during the scoring step;
        required when ``testmode`` is False
    testmode : bool, optional
        flag value manually set used for test purposes. When True,
        ``args_obj`` is ignored and fixed settings are used (1 core,
        threshold 1, recomb enabled, everything else disabled)

    Returns
    -------
    pandas.DataFrame
        scoring results

    Raises
    ------
    FileNotFoundError
        if ``sequence_loc`` is not a string
    ValueError
        if ``motif`` is not a Motif, or ``args_obj`` is not a Findmotif
        while ``testmode`` is False
    SubprocessError
        if the removal of ``sequence_loc`` fails
    """

    cores: int
    threshold: float
    no_qvalue: bool
    qval_t: bool
    no_reverse: bool
    recomb: bool
    verbose: bool
    errmsg: str

    if not isinstance(sequence_loc, str):
        # sequence_loc is not a string on this path, so it must be
        # converted before joining (str.join rejects non-str items)
        errmsg = ''.join(["\n\nERROR: unable to locate extracted sequences in ", 
                          str(sequence_loc)])
        raise FileNotFoundError(errmsg)

    if not isinstance(motif, Motif):
        errmsg = "\n\nERROR: the given motif is not an instance of Motif"
        raise ValueError(errmsg)

    if not testmode:
        if not isinstance(args_obj, Findmotif):
            errmsg = "\n\nERROR: unrecognized argument object type"
            raise ValueError(errmsg)

        cores = args_obj.get_cores()
        threshold = args_obj.get_threshold()
        no_qvalue = args_obj.get_no_qvalue()
        qval_t = args_obj.get_qvalueT()
        no_reverse = args_obj.get_no_reverse()
        recomb = args_obj.get_recomb()
        verbose = args_obj.get_verbose()
    else:
        cores = 1
        threshold = 1
        recomb = True
        no_qvalue = False
        qval_t = False
        no_reverse = False
        verbose = False

    assert threshold > 0
    assert threshold <= 1
    assert cores >= 1

    print_scoring_msg(no_reverse, motif)

    cwd: str = os.getcwd()
    os.chdir(sequence_loc)

    # whatever happens while scoring, go back to the starting directory
    try:
        manager: SyncManager = mp.Manager()
        # results
        return_dict: DictProxy = manager.dict()
        # scanned nucleotides
        scanned_nucs_dict: DictProxy = manager.dict()
        # scanned sequences  
        scanned_seqs_dict: DictProxy = manager.dict()  

        # get all tmp files containing sequences
        sequences: List[str] = glob.glob('*.tsv')  
        if len(sequences) < cores:
            # never start more processes than there are files
            # NOTE(review): with no *.tsv file cores becomes 0 and
            # np.array_split raises ValueError -- confirm callers never
            # pass an empty location
            cores = len(sequences)
        # split the sequence set in no. cores chunks
        sequences_split: List[str] = np.array_split(sequences, cores)  

        jobs = list()  # jobs list
        proc_finished: int = 0 

        # NOTE(review): SIGINT is set to SIG_IGN and the previous handler
        # is restored right away, before any process is started, so the
        # net effect on the parent is nil. Kept as in the original, see
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        signal.signal(signal.SIGINT, original_sigint_handler)  

        if verbose:
            start_s: float = time.time()

        try:

            # compute results in parallel
            for i in range(cores):
                p = mp.Process(
                    target=score_seqs, args=(
                        sequences_split[i], motif, no_reverse, return_dict, 
                        scanned_seqs_dict, scanned_nucs_dict, i
                        )
                    )
                jobs.append(p)
                p.start()  
            # end for

            # print the bar at 0% before any job has been joined
            printProgressBar(proc_finished, cores, prefix='Progress:',
                             suffix='Complete', length=50)
            for job in jobs:
                job.join()  # sync point
                proc_finished += 1
                printProgressBar(proc_finished, cores, prefix='Progress:',
                                 suffix='Complete', length=50)
            # end for
        
        except KeyboardInterrupt:
            sigint_handler()
            sys.exit(2)

        else:
            if verbose:
                end_s: float = time.time()
                print(
                    "Scored all sequences in %.2fs" % (end_s - start_s)
                )
        # end try

    finally:
        os.chdir(cwd) 

    if not testmode:
        cmd: str = "rm -rf {0}".format(sequence_loc)
        code: int = subprocess.call(cmd, shell=True)

        if code != 0:
            errmsg = "\n\nERROR: an error occurred while running %s" % cmd
            raise SubprocessError(errmsg)
    

    if verbose:
        start_df: float = time.time()

    # recover all analysis results and summarize them in a single 
    # data structure
    seqnames: List[str] = list()
    seqs: List[str] = list()
    chroms: List[str] = list()  # collected as before, not passed to build_df
    starts: List[int] = list()
    stops: List[int] = list()
    strands: List[str] = list()
    scores: List[np.double] = list()
    pvalues: List[np.double] = list()
    frequencies: List[int] = list()
    references: List[str] = list()

    seqs_scanned: int = 0
    nucs_scanned: int = 0

    for key in return_dict.keys():

        # fetch the entry once: every lookup on the manager proxy is a
        # round trip to the manager process
        result = return_dict[key]

        assert isinstance(result, ResultTmp)

        seqnames += result.get_seqnames()
        seqs += result.get_seqs()
        chroms += result.get_chroms()
        starts += result.get_starts()
        stops += result.get_stops()
        strands += result.get_strands()
        scores += result.get_scores()
        pvalues += result.get_pvalues()
        frequencies += result.get_frequencies()
        references += result.get_references()

        # compute the total number of scanned sequences and nucleotides
        seqs_scanned += scanned_seqs_dict[key]  # the keys are the same as return_dict
        nucs_scanned += scanned_nucs_dict[key]  # the keys are the same as return_dict
    # end for

    qvalues: List[np.double]
    # compute the q-values
    if no_qvalue:
        qvalues = list()  # empty list -> not computed
    else:
        qvalues = compute_qvalues(pvalues)
    # end if

    print("Scanned sequences:", seqs_scanned)
    print("Scanned nucleotides:", nucs_scanned)

    # summarize results in a pandas DF
    finaldf: pd.DataFrame = build_df(motif, seqnames, starts, stops, strands, 
                                     scores, pvalues, qvalues, seqs, frequencies, 
                                     references, threshold, qval_t, no_qvalue, 
                                     recomb)

    if verbose:
        end_df: float = time.time()
        print("\nResults summary built in %.2fs" % (end_df - start_df))
    
    return finaldf
Ejemplo n.º 5
0
def compute_results(motif, sequence_loc, args_obj):
    """
        Score all the sequences extracted from regions defined in the
        input BED file.
        To score sequences is used the processed input motif.
        The sequences stored in the *.tsv files found in sequence_loc
        are scored in parallel by score_seqs; the results are then
        merged and stored in a pandas DataFrame. The sequence_loc
        directory is removed once the scoring is over
        ----
        Parameters:
            motif (Motif) : processed motif, used to score sequences
            sequence_loc (str) : path to temporary files storing sequences extracted
                                    during the previous step
            args_obj (Findmotif) : arguments used during the sequence scoring step
        ----
        Returns:
            finaldf (pd.DataFrame) : pandas DataFrame containing the results of
                                        the GRAFIMO analysis
        ----
        Raises:
            FileNotFoundError : if sequence_loc is not a string
            ValueError : if motif is not a Motif or args_obj is not a
                            Findmotif
            SubprocessError : if the removal of sequence_loc fails
    """

    if not isinstance(sequence_loc, str):
        # str.join takes a single iterable of strings: the pieces must be
        # wrapped in a list and sequence_loc (not a str here) converted
        errmsg = ''.join(["\n\nERROR: unable to locate extracted sequences in ",
                          str(sequence_loc), ". Exiting"])
        raise FileNotFoundError(errmsg)

    if not isinstance(motif, Motif):
        raise ValueError(
            "\n\nERROR: the given motif is not an instance of Motif")

    if not isinstance(args_obj, Findmotif):
        raise ValueError("\n\nERROR: unrecognized argument object type")

    # reading arguments
    cores = args_obj.get_cores()
    threshold = args_obj.get_threshold()
    no_qvalue = args_obj.get_no_qvalue()
    qval_t = args_obj.get_qvalueT()
    no_reverse = args_obj.get_no_reverse()
    verbose = args_obj.get_verbose()

    assert threshold > 0
    assert threshold <= 1
    assert cores >= 1

    print_scoring_msg(no_reverse, motif)

    cwd = os.getcwd()
    os.chdir(sequence_loc)  # go to sequence location

    manager = mp.Manager()
    return_dict = manager.dict()  # results
    scanned_nucs_dict = manager.dict()  # nucleotides scanned
    scanned_seqs_dict = manager.dict()  # sequences scanned

    sequences = glob.glob('*.tsv')  # get all tmp files containing sequences
    sequences_split = np.array_split(
        sequences, cores)  # split the sequence set in #cores chunks

    jobs = []  # jobs list
    proc_finished = 0  # number of jobs done

    # NOTE(review): SIGINT is set to SIG_IGN and the previous handler is
    # restored right away, before any process is started. Kept as in the
    # original, see
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGINT, original_sigint_handler)

    if verbose:
        start_s = time.time()

    try:

        # compute results in parallel
        for i in range(cores):
            p = mp.Process(target=score_seqs,
                           args=(sequences_split[i], motif, no_reverse,
                                 return_dict, scanned_seqs_dict,
                                 scanned_nucs_dict, i))
            jobs.append(p)
            p.start()  # start the process
        # end for

        # print the bar at 0% before any job has been joined
        printProgressBar(proc_finished,
                         cores,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)
        for job in jobs:
            job.join()  # sync point
            proc_finished += 1
            printProgressBar(proc_finished,
                             cores,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)
        # end for
    except KeyboardInterrupt:
        sigint_handler()
        sys.exit(2)

    else:

        if verbose:
            end_s = time.time()
            msg = ''.join(
                ["\nScored all sequences in ",
                 str(end_s - start_s), "s"])
            print(msg)
        # end if
    # end try

    os.chdir(cwd)  # get back to starting point

    cmd = "rm -rf {0}".format(sequence_loc)  # remove temporary sequence files
    code = subprocess.call(cmd, shell=True)

    if code != 0:
        msg = ' '.join(["\n\nERROR: an error occurred while running", cmd])
        raise SubprocessError(msg)
    # end if

    if verbose:
        start_df = time.time()

    # recover all analysis results and summarize them in a single data-structure
    seqnames = []
    seqs = []
    chroms = []  # collected as before, not passed to build_df
    starts = []
    stops = []
    strands = []
    scores = []
    pvalues = []
    references = []

    seqs_scanned = 0
    nucs_scanned = 0

    for key in return_dict.keys():

        # fetch the entry once: every lookup on the manager proxy is a
        # round trip to the manager process
        result = return_dict[key]

        assert isinstance(result, ResultTmp)

        seqnames += result.get_seqnames()
        seqs += result.get_seqs()
        chroms += result.get_chroms()
        starts += result.get_starts()
        stops += result.get_stops()
        strands += result.get_strands()
        scores += result.get_scores()
        pvalues += result.get_pvalues()
        references += result.get_references()

        # compute the total number of scanned sequences and nucleotides
        # (the keys are the same as return_dict)
        seqs_scanned += scanned_seqs_dict[key]
        nucs_scanned += scanned_nucs_dict[key]
    # end for

    # compute the q-values
    if no_qvalue:
        qvalues = []  # empty list -> not computed
    else:
        qvalues = compute_qvalues(pvalues)
    # end if

    print("Scanned sequences:", seqs_scanned)
    print("Scanned nucleotides:", nucs_scanned)

    # summarize results in a pandas DF
    finaldf = build_df(motif, seqnames, starts, stops, strands, scores,
                       pvalues, qvalues, seqs, references, threshold, qval_t,
                       no_qvalue)

    if verbose:
        end_df = time.time()
        msg = ''.join(
            ["\nBuilt result summary in ",
             str(end_df - start_df), "s"])
        print(msg)  # the message used to be built and silently dropped

    return finaldf
Ejemplo n.º 6
0
def getRegion_graph(region: str, genome_loc: str) -> None:
    """Get a PNG image representing the queried region of the genome
    variation graph.

    The region is first written to a hidden ``.<region>.vg`` file with
    ``vg find``, converted to ``.<region>.dot`` with ``vg view`` and 
    finally rendered as ``<region>.png`` with ``dot``. All the files are
    written in the current working directory; the two hidden 
    intermediate files are removed at the end.

    Parameters
    ----------
    region : str
        region for which the PNG image will be obtained
    genome_loc : str
        path to the genome variation graph(s): either a single XG file 
        (name ending in ``.xg``) or a directory containing one XG per
        chromosome, named chr1.xg, chr20.xg, chrX.xg, etc.

    Raises
    ------
    Exception
        if ``genome_loc`` is neither an existing XG file nor an existing
        directory
    SubprocessError
        if any of the external commands exits with a non-zero status
    """

    errmsg: str
    cmd: str
    code: int
    xg: str

    if genome_loc.split('.')[-1] == 'xg':
        # we have the whole genome graph

        if not os.path.isfile(genome_loc):
            errmsg = ' '.join(
                ["\n\nERROR: Unable to locate", genome_loc, "\n"])
            raise Exception(errmsg)

        xg = genome_loc

    else:
        # we have a directory containing the genome graphs

        if not os.path.isdir(genome_loc):
            raise Exception("\n\nERROR: unable to locate %s" % genome_loc)

        # if given separate VGs (as built by GRAFIMO) they are called
        # chr1.xg, chr20.xg, chrX.xg, etc.
        xg = os.path.join(
            genome_loc, ''.join(["chr", region.split(':')[0], '.xg']))
    # end if

    vg_region: str = ''.join([".", region, ".vg"])
    dot_region: str = ''.join([".", region, ".dot"])
    png_image: str = ''.join([region, '.png'])

    commands: List[str] = [
        # extract the queried region from the XG
        "vg find -x {0} -E -p {1} > {2}".format(xg, region, vg_region),
        # convert it to DOT format
        "vg view {0} -dp > {1}".format(vg_region, dot_region),
        # render the PNG image
        "dot -Tpng {0} -o {1}".format(dot_region, png_image),
        # clean the directory from the intermediate files: only those
        # created by this call are removed, never other hidden .vg/.dot
        # files that happen to live in the same directory
        "rm {0}".format(vg_region),
        "rm {0}".format(dot_region),
    ]

    for cmd in commands:
        code = subprocess.call(cmd, shell=True)
        if code != 0:
            raise SubprocessError("\n\nERROR: unable to execute %s" % cmd)
Ejemplo n.º 7
0
def write_results(results: pd.DataFrame, motif_id: str, motif_num: int,
                  args_obj: Findmotif) -> None:
    """Write GRAFIMO analysis results in three files (TSV report, HTML 
    report, GFF3 file).

    The first two reports contain in a tabular format the motif 
    occurrence candidates retrieved by GRAFIMO, as stored in the 
    ``results`` DataFrame.

    The third file is written by ``writeGFF3`` and, according to the 
    original author, is a suitable input for a custom track on the UCSC
    genome browser.

    If requested (``top_graphs > 0``), a PNG image of the top regions of
    the genome variation graph is also stored in a dedicated 
    subdirectory of the output directory.

    The working directory is changed while writing and is always 
    restored before returning, also when an error occurs.

    Parameters
    ----------
    results : pandas.DataFrame
        results of GRAFIMO analysis
    motif_id : str
        motif ID
    motif_num : int
        number of searched motifs
    args_obj : Findmotif
        container for arguments needed to store the results

    Raises
    ------
    ValueError
        if ``args_obj`` is not a Findmotif instance
    NoDataFrameException
        if ``results`` is not a pandas DataFrame
    VGException
        if ``args_obj`` carries neither a graph genome nor a graph 
        genome directory
    SubprocessError
        if an output directory cannot be created
    """

    errmsg: str
    if not isinstance(args_obj, Findmotif):
        errmsg = "\n\nERROR: incorrect data-type. Exiting"
        raise ValueError(errmsg)

    if not isinstance(results, pd.DataFrame):
        errmsg = "\n\nERROR: results must be stored in a pandas DataFrame"
        raise NoDataFrameException(errmsg)

    # get results storing arguments
    outdir: str = args_obj.get_outdir()
    no_qvalue: bool = args_obj.get_no_qvalue()
    top_graphs: int = args_obj.get_top_graphs()
    verbose: bool = args_obj.get_verbose()

    vg: str
    if args_obj.has_graph_genome():
        vg = args_obj.get_graph_genome()
    elif args_obj.has_graph_genome_dir():
        vg = args_obj.get_graph_genome_dir()
    else:
        errmsg = "\n\nERROR: no VG given"
        raise VGException(errmsg)

    dirname_default: bool = False

    cwd: str = os.getcwd()

    if outdir == DEFAULT_OUTDIR:
        # to make unique the output directory we add the PID
        # to the name.
        #
        # This is useful when calling grafimo in different runs on the
        # same machine.

        # append the PID and the motif ID
        outdir = '_'.join(["grafimo_out", str(os.getpid()), motif_id])
        dirname_default = True
    # end if

    cmd: str
    code: int

    # from here on the working directory changes: the finally clause
    # brings us back to the starting point whatever happens
    try:
        if not os.path.isdir(outdir):
            cmd = 'mkdir -p {0}'.format(outdir)
            code = subprocess.call(cmd, shell=True)

            if code != 0:
                errmsg = ''.join(
                    ["\n\nERROR: An error occurred while executing ", cmd, "\n"])
                raise SubprocessError(errmsg)
        # end if

        # NB if the directory already existed its content will be
        # automatically overwritten
        os.chdir(outdir)

        print("\nWriting results in %s\n" % outdir)

        prefix: str
        # get the filename prefix
        if not dirname_default and motif_num > 1:
            # each file is labeled with the motif ID
            prefix = '_'.join(['grafimo_out', motif_id])
        else:
            prefix = 'grafimo_out'

        if verbose:
            start_tsv: float = time.time()

        # write the TSV
        results.to_csv(''.join([prefix, '.tsv']), sep='\t', encoding='utf-8')

        if verbose:
            end_tsv: float = time.time()
            print("%s.tsv written in %.2fs" % (prefix, (end_tsv - start_tsv)))
            start_html: float = time.time()

        # write the HTML
        results.to_html(''.join([prefix, '.html']))

        if verbose:
            end_html: float = time.time()
            print("%s.html written in %.2fs" % (prefix, (end_html - start_html)))
            start_gff: float = time.time()

        # write the GFF3
        writeGFF3(prefix, results, no_qvalue)

        if verbose:
            end_gff: float = time.time()
            print("%s.gff written in %.2fs" % (prefix, (end_gff - start_gff)))

        # get the graphs of the top n regions
        regions: List[str]
        if top_graphs > 0:
            regions = results['sequence_name'].to_list()

            # the results are empty
            if len(regions) == 0:
                print(
                    "WARNING: no region was available. Are your results empty?\n")
                return

            # get the n different top regions' graph
            regions = unique_lst(regions, size=top_graphs)

            if len(regions) < top_graphs:
                top_graphs = len(regions)
                print("WARNING: possible to visualize only the top %d regions\n" %
                      (top_graphs))

            # report the number of graphs actually available
            if verbose:
                print("Extracting %d region variation graphs" % top_graphs)

            # create the directory for the regions pictures
            image_dir: str
            if motif_num > 1:
                image_dir = '_'.join(["top_graphs", motif_id])
            else:
                image_dir = "top_graphs"

            if verbose:
                print("Graphs will be stored in %s" % image_dir)

            cmd = "mkdir -p {0}".format(image_dir)
            code = subprocess.call(cmd, shell=True)
            if code != 0:
                errmsg = ' '.join(
                    ["\n\nERROR: an error occurred while executing", cmd, "\n"])
                raise SubprocessError(errmsg)

            os.chdir(image_dir)

            print("Writing the top %d graphs in %s\n" % (top_graphs, image_dir))

            for region in regions[:top_graphs]:
                # the VG accepts only queries like 1:100-200 and not like
                # chr1:100-200: drop the leading "chr" only (names without
                # it are passed as they are, and a "chr" occurring later
                # in the name is left untouched)
                if region.startswith("chr"):
                    region = region[len("chr"):]

                if verbose:
                    print("Computing the PNG image of %s" % region)

                getRegion_graph(region, vg)
            # end for
        # end if

    finally:
        os.chdir(cwd)
Ejemplo n.º 8
0
def get_reference_genome_from_ucsc() -> str:
    """Fetch the hg38 reference genome from the UCSC database and unpack
    it in the current working directory.

    The compressed FASTA is downloaded with ``wget``, uncompressed with
    ``gunzip`` and, if the archive is still around afterwards, deleted
    with ``rm``.

    This function has been written only for test purposes
    
    Parameters
    ----------

    Returns
    -------
    str
        absolute path to the uncompressed FASTA file (``hg38.fa``)

    Raises
    ------
    SubprocessError
        if one of the external commands exits with a non-zero status
    FileNotFoundError
        if the compressed genome is missing after the download
    """

    def _shell(command: str) -> None:
        # run the command through the shell; a non-zero exit status is
        # reported as a SubprocessError
        status: int = subprocess.call(command, shell=True)
        if status != 0:
            raise SubprocessError(
                "\n\nERROR: an error occurred while executing " + command
                + ". Exiting"
            )

    url = 'ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz'
    archive: str = './hg38.fa.gz'
    fasta: str = "./hg38.fa"

    # step 1: download (the archive lands in the current directory)
    _shell('wget -c {0}'.format(url))

    # step 2: uncompress, after checking that the archive is really there
    print("Uncompressing the genome...")
    if not os.path.exists(archive):
        raise FileNotFoundError("\n\nERROR: " + archive + " not found")
    _shell('gunzip {0}'.format(archive))

    # step 3: drop the archive when it has not been removed already
    if os.path.exists(archive):
        _shell('rm {0}'.format(archive))

    # step 4: the FASTA must exist by now; hand back its absolute path
    assert os.path.exists(fasta)
    return os.path.abspath(fasta)
Ejemplo n.º 9
0
def construct_vg(buildvg_args: BuildVG) -> None:
    """ Create the genome graph from the given genome reference and 
    phased VCF file given.
    
    The genome is not built as a single whole genome graph but a
    single graph is constructed for each chromosome.
    According to the original author, this approach avoids memory 
    issues, allows the genome variation graph construction also on 
    machines with less resources, and has no drawback with respect to 
    building and querying a whole genome graph.

    For each chromosome the graph is built (``build_vg``), indexed
    (``indexVG``) and then the intermediate hidden ``.chrN.vg`` file is
    deleted. If needed (or asked through the reindex argument), the VCF 
    is (re)indexed with tabix beforehand. The graphs are written in the
    output directory; the starting working directory is always restored
    before returning, also when an error occurs.
    
    Parameters
    ----------
        buildvg_args : BuildVG
            container for the arguments required to build the genome 
            variation graph 

    Raises
    ------
        ValueError
            if ``buildvg_args`` is not a BuildVG instance or if an 
            unknown chromosome is requested
        SubprocessError
            if tabix or rm exits with a non-zero status
        VGException
            if the construction or the indexing of a graph fails
    """

    def run_or_raise(command: str) -> None:
        # run the command through the shell; a non-zero exit status is
        # reported as a SubprocessError carrying the failing command
        status: int = subprocess.call(command, shell=True)
        if status != 0:
            raise SubprocessError(''.join([
                "\n\nERROR: an error occurred while executing ", command,
                ". Exiting"
            ]))

    errmsg: str
    if not isinstance(buildvg_args, BuildVG):
        errmsg = "Unknown arguments object type. "
        errmsg += "Cannot build the genome variation graph. Exiting"
        raise ValueError(errmsg)

    # read the arguments to build the VGs
    reindex: bool = buildvg_args.get_reindex()
    chroms: List[str] = buildvg_args.get_chroms()
    threads: int = buildvg_args.get_cores()
    outdir: str = buildvg_args.get_outdir()
    verbose: bool = buildvg_args.get_verbose()
    test: bool = buildvg_args.get_test()  # manually set in the code

    reference: str
    vcf: str

    if test:
        reference = get_reference_genome_from_ucsc()
        vcf = get_1000GProject_vcf()

    else:
        reference = buildvg_args.get_reference_genome()
        vcf = buildvg_args.get_vcf()
    # end if

    if verbose:
        print("using reference genome: ", reference)
        print("Using VCF file: ", vcf, "\n\n")
    # end if

    msg: str

    if verbose:
        start_c: float = time.time()
        msg = "Reading chromosome from reference file for which will be built the VG..."
        print(msg)

    # read the chromosome present in the used reference file
    chroms_available: List[str] = get_chromlist(reference)

    if verbose:
        end_c: float = time.time()
        print("done in %.2fs" % (end_c - start_c))
        print("Found chromosomes:\n", chroms_available, end="\n\n")

    if len(chroms) == 1 and chroms[0] == 'ALL_CHROMS':
        chroms = chroms_available
    else:
        # check if the given chromosome number is among those whose sequence
        # is in the reference file
        if any(c not in chroms_available for c in chroms):
            raise ValueError("Unknown chromosome given")

    # end if

    cwd: str = os.getcwd()

    cmd: str
    code: int

    tabix_cmd: str = 'tabix -p vcf {0}'.format(vcf)

    # check if the VCF file has already been indexed with tabix
    if not tbiexist(vcf):
        msg = ''.join([
            "TBI file not found for ",
            vcf.split('/')[-1], ". Indexing the VCF file with tabix..."
        ])
        print(msg)
        run_or_raise(tabix_cmd)

    elif reindex:  # the user want to reindex the VCF file with tabix
        msg = ''.join(["Reindexing ", vcf.split('/')[-1], "...\n"])
        print(msg)

        # remove the existing TBI file, then reindex the VCF
        run_or_raise("rm {0}".format(''.join([vcf, ".tbi"])))
        run_or_raise(tabix_cmd)
    # end if

    # enter the output directory
    os.chdir(outdir)

    # whatever happens while building, return to the original working
    # directory
    try:
        # build the VG for each chromosome or a user defined
        # subset of them
        for chrom_n in chroms:

            chrom: str = ''.join(['chr', chrom_n])
            vg: str = ''.join([".", chrom, '.vg'])

            # build the VG for the current chromosome
            if verbose:
                start_build: float = time.time()

            code = build_vg(vg, reference, vcf, chrom_n, threads)
            if code != 0:
                msg = '\n\nERROR: an error occurred during {0} construction. '.format(
                    vg)
                msg += 'Unable to build the VG of the genome using {0} and {1}'.format(
                    reference, vcf)
                raise VGException(msg)
            # end if

            if verbose:
                end_build: float = time.time()
                msg = "Elapsed time to build {0} ".format(vg)
                print(msg, "%.2fs" % (end_build - start_build))
            # end if

            # to query efficiently the VGs we index them (VG -> XG)
            if verbose:
                start_index: float = time.time()

            msg = ''.join(["Indexing ", vg, ' and building the GBWT index...'])
            print(msg)

            code = indexVG(vg, vcf, threads, verbose)

            if code != 0:
                errmsg = "\n\nERROR: an error occurred while indexing {0}.".format(
                    vg)
                errmsg += "\nUnable to index {0}. Exiting".format(vg)
                raise VGException(errmsg)
            # end if

            if verbose:
                end_index: float = time.time()
                msg = "Elapsed time to index {0}".format(vg)
                print(msg, "%.2fs" % (end_index - start_index))
            # end if

            # The majority of applications work only with indexed graph,
            # so to save disk space is worth to delete the VGs and keep
            # only the XGs (is simple to get back using VG built-in functions)
            if verbose:
                print("Deleting {0}".format(vg))

            # the exit status of rm is now the one being checked (the
            # stale indexVG status used to be tested instead) and the
            # error carries its message
            cmd = 'rm {0}'.format(vg)
            run_or_raise(cmd)
        # end for

    finally:
        # return to the original working directory
        os.chdir(cwd)
Ejemplo n.º 10
0
def construct_vg(buildvg_args):
    """
        Create the genome graph, for the reference and VCF file given
        in input by the user.

        The genome is not built as a single whole genome graph but a
        single graph is constructed for each chromosome.
        This choice was made to avoid memory issues and make able
        also the less powerful machines to run GRAFIMO.

        Steps performed:
            1. the VCF is indexed with tabix (an already existing TBI
               file is removed and rebuilt)
            2. inside the output directory, for each chromosome, the
               VG is built, indexed (VG -> XG) and then the VG file is
               deleted
            3. the original working directory is restored, also when
               one of the steps fails
        ----
        Parameters:
            buildvg_args (BuildVG) : object storing the arguments
                                     required to build the graphs
                                     (chromosomes, cores, output
                                     directory, verbosity, test flag,
                                     reference genome and VCF)
        ----
        Return:
            None
        ----
        Raises:
            ValueError : if buildvg_args is not a BuildVG instance
            SubprocessError : if tabix or one of the rm commands exits
                              with a non-zero code
            VGException : if building or indexing a VG fails
    """

    if not isinstance(buildvg_args, BuildVG):
        raise ValueError("Unknown arguments object type. Cannot Build the genome variation graph. Exiting")

    # read the arguments to build the VGs
    chroms = buildvg_args.get_chroms()
    threads = buildvg_args.get_cores()
    outdir = buildvg_args.get_outdir()
    verbose = buildvg_args.get_verbose()
    test = buildvg_args.get_test()

    if test:
        # test mode: download the reference genome and the VCF
        reference = get_reference_genome_from_ucsc()
        vcf = get_1000GProject_vcf()
    else:
        reference = buildvg_args.get_reference_genome()
        vcf = buildvg_args.get_vcf()
    # end if

    if verbose:
        print("using reference genome: ", reference)
        print("Using VCF file: ", vcf, "\n\n")
    # end if

    cwd = os.getcwd()

    # (re)index the VCF with tabix; an existing TBI file is removed first
    # so that the index is always rebuilt from the current VCF
    vcf_name = vcf.split('/')[-1]
    if tbiexist(vcf):
        print(''.join(["Reindexing ", vcf_name, "..."]))
        _run_shell_command("rm {0}".format(''.join([vcf, ".tbi"])))
    else:
        print(''.join(["TBI file not found for ", vcf_name, ". Indexing the VCF file with tabix..."]))
    # end if

    _run_shell_command('tabix -p vcf {0}'.format(vcf))

    # enter the output directory
    os.chdir(outdir)

    try:
        # build the VG for each chromosome or a user defined
        # subset of them
        for chrom_n in chroms:

            # build_vg is given both the bare chromosome number and the
            # same number preceded by 'chr'
            chrom = ''.join(['chr', chrom_n])
            vg = chrom + '.vg'

            # build the VG for the current chromosome
            if verbose:
                start_build = time.time()

            code = build_vg(vg, reference, vcf, chrom, chrom_n, threads)
            if code != 0:
                msg = '\n\nERROR: an error occurred during {0} construction. '.format(vg)
                msg += 'Unable to build the VG of the genome using {0} and {1}'.format(reference,
                                                                                       vcf)
                raise VGException(msg)
            # end if

            if verbose:
                end_build = time.time()
                msg = "Elapsed time to build {0} ".format(vg)
                msg = ''.join([msg, str(end_build - start_build), "s"])
                print(msg)
            # end if

            # to query efficiently the VGs we index them (VG -> XG)
            if verbose:
                start_index = time.time()

            print(''.join(["Indexing ", vg, '...']))

            code = indexVG(vg, threads)
            if code != 0:
                errmsg = "\n\nERROR: an error occurred during indexing {0}.\nUnable to index {0}. Exiting".format(vg)
                raise VGException(errmsg)
            # end if

            if verbose:
                end_index = time.time()
                msg = "Elapsed time to index {0} ".format(vg)
                msg = ''.join([msg, str(end_index - start_index), "s"])
                print(msg)
            # end if

            # The majority of applications work only with indexed graph,
            # so to save disk space is worth to delete the VGs and keep
            # only the XGs (is simple to get back using VG built-in functions)
            if verbose:
                print("Deleting {0}".format(vg))

            # the exit code of rm itself is checked here (not the stale
            # code returned by the indexing step)
            _run_shell_command('rm {0}'.format(vg))
        # end for

    finally:
        # return to the original working directory, even on failure
        os.chdir(cwd)
    # end try


def _run_shell_command(cmd):
    """
        Run a command through the shell and fail loudly when it
        exits with a non-zero code
        ----
        Parameters:
            cmd (str) : command line to execute
        ----
        Return:
            None
        ----
        Raises:
            SubprocessError : if the command exit code is not 0
    """

    code = subprocess.call(cmd, shell=True)

    if code != 0:
        errmsg = ''.join(["\n\nERROR: an error occurred while executing ", cmd, ". Exiting"])
        raise SubprocessError(errmsg)
    # end if
Ejemplo n.º 11
0
def get_regions(motif, args_obj):
    """
        Compute all sequences of length L (L is the
        motif width) from the VG(s).
        The sequences are extracted from the regions defined
        in the input BED file.

        One 'vg find' query is prepared for every BED region lying on
        one of the chromosomes to scan; the queries are executed by a
        pool of worker processes and each one writes its result to a
        TSV file inside a freshly created temporary directory.
        The working directory of the caller is restored before
        returning, also when an error occurs.
        ----
        Parameters:
            motif (Motif) : motif to search on the VG
            args_obj (Findmotif) : object storing the arguments
                                    required to extract the
                                    regions defined in the BED
                                    file, from the VG(s)
        ----
        Return:
            sequence_loc (str) : location of the tmp files,
                                    containing the extracted
                                    sequences
        ----
        Raises:
            ValueError : if motif is not a Motif instance
            VGException : if no genome graph was given or the single
                          genome graph is not in XG format
            FileNotFoundError : if an XG file needed by a query does
                                not exist
    """

    # check the input arguments
    if not isinstance(motif, Motif):
        errmsg = "\n\nERROR: unknown motif object type"
        raise ValueError(errmsg)

    # decide ONCE whether we work on a single XG or on a directory of
    # XGs, so that the path read here is always interpreted the same way
    # when the queries are built
    if args_obj.has_graph_genome():
        graph_is_dir = False
        vg = args_obj.get_graph_genome()

        if not isGraph_genome_xg(vg):
            errmsg = "\n\nERROR: the genome variation graph is not in XG format"
            raise VGException(errmsg)
        # end if

    elif args_obj.has_graph_genome_dir():
        graph_is_dir = True
        vg = args_obj.get_graph_genome_dir()

    else:
        raise VGException("\n\nERROR: the genome variation graph is missing")
    # end if

    bedfile = args_obj.get_bedfile()
    motif_width = motif.getWidth()
    chroms = args_obj.get_chroms()
    cores = args_obj.get_cores()

    global verbose
    verbose = args_obj.get_verbose()

    print("\nExtracting regions defined in", bedfile, "\n")

    # read the regions where search the motif occurrences from the given BED file
    regions = getBEDregions(bedfile)

    if verbose:
        print("\nFound", len(regions), "regions in", bedfile)

    # user defined subset of the chromosomes, or all of them
    chr_list = [''.join(['chr', c]) for c in (chroms if chroms else CHROMS_LIST)]

    cwd = os.getcwd()

    # the queries run from inside the tmp directory: anchor a relative
    # graph path to the original working directory so it stays valid
    # (os.path.join leaves an absolute path untouched)
    vg = os.path.join(cwd, vg)

    if graph_is_dir and not vg.endswith("/"):
        vg = ''.join([vg, "/"])
    # end if

    # create a tmp working directory; mkdtemp already creates a new
    # directory with a unique name, so nothing else has to be done
    tmpwd = tempfile.mkdtemp(prefix='grafimo_')

    # enter the tmp dir where store the extracted sequences
    os.chdir(tmpwd)

    try:
        sequence_loc = os.getcwd()  # the extracted sequences are stored here

        if verbose:
            start_re = time.time()

        # build (and validate) the queries before starting any worker,
        # so that a missing XG does not leave a pool behind
        queries = _build_region_queries(regions, chr_list, vg, graph_is_dir,
                                        motif_width)

        pool = _create_worker_pool(cores)

        # extract regions
        try:

            # query the VGs
            res = pool.map_async(get_seqs, queries)

            if not verbose:
                _wait_with_progress(res)

            # wait with a timeout (60 * 60 * 60 seconds)
            res.get(60 * 60 * 60)  # does not ignore signals

        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()

        except Exception:
            # do not leave worker processes around on failure
            pool.terminate()
            raise

        else:
            pool.close()
            pool.join()

            if verbose:
                end_re = time.time()
                msg = ''.join([
                    "Extracted all regions from VGs stored in ", vg, ", in ",
                    str(end_re - start_re), "s"
                ])
                print(msg)
            # end if
        # end try

    finally:
        os.chdir(cwd)  # get back to the origin
    # end try

    return sequence_loc


def _build_region_queries(regions, chr_list, graph, graph_is_dir, motif_width):
    """
        Build the 'vg find' command line for every region lying on
        one of the chromosomes to scan
        ----
        Parameters:
            regions (list) : regions read from the BED file; each one
                             is indexed by 'chr', 'start' and 'stop'
            chr_list (list) : names of the chromosomes to query
            graph (str) : path to the XG file, or to the directory
                          (ending with '/') containing one XG per
                          chromosome
            graph_is_dir (bool) : True if graph is a directory
            motif_width (int) : value given to the -K option of vg find
        ----
        Return:
            queries (list) : command lines to execute
        ----
        Raises:
            FileNotFoundError : if a required XG file does not exist
    """

    allowed = set(chr_list)
    checked = set()  # XG files whose existence has already been verified
    queries = []

    for region in regions:
        chrom = region['chr']

        if chrom not in allowed:
            continue  # chromosome not among the ones to query

        start = region['start']
        stop = region['stop']

        region_index = ''.join([chrom, ':', str(start), '-', str(stop)])
        region_name = ''.join([chrom, '_', str(start), '-', str(stop)])
        seqs = correct_path('./', region_name, '.tsv')

        xg = ''.join([graph, chrom, '.xg']) if graph_is_dir else graph

        if xg not in checked:
            if not os.path.exists(xg):
                errmsg = ''.join(
                    ["\n\nERROR: unable to use ", xg, ". Exiting"])
                raise FileNotFoundError(errmsg)
            # end if
            checked.add(xg)
        # end if

        query = 'vg find -x {0} -E -p {1} -K {2} > {3}'.format(
            xg, region_index, motif_width, seqs)
        queries.append(query)
    # end for

    return queries


def _create_worker_pool(cores):
    """
        Create the pool of worker processes while SIGINT is ignored,
        then put back the previous SIGINT handler in the parent
        https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        ----
        Parameters:
            cores (int) : number of worker processes
        ----
        Return:
            pool (multiprocessing.Pool) : the pool created
    """

    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)

    try:
        pool = mp.Pool(processes=cores)  # use #cores processes
    finally:
        # restore the handler also if the pool cannot be created
        signal.signal(signal.SIGINT, original_sigint_handler)
    # end try

    return pool


def _wait_with_progress(res):
    """
        Print a progress bar every two seconds until the asynchronous
        map is completed
        ----
        Parameters:
            res : result object returned by pool.map_async()
        ----
        Return:
            None
    """

    # read the total BEFORE polling, so it is always defined, and keep
    # it >= 1 so the bar is never asked to draw 0 out of 0 (no query
    # to run, or everything already finished)
    tot = max(res._number_left, 1)

    while not res.ready():
        remaining = res._number_left
        printProgressBar((tot - remaining),
                         tot,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)
        time.sleep(2)
    # end while

    # when finished call for the last time printProgressBar()
    printProgressBar(tot,
                     tot,
                     prefix='Progress:',
                     suffix='Complete',
                     length=50)