if args.html:
        with open(args.html, "w") as htmlOut:
            print >> htmlOut, etree.tostring(html, pretty_print=True)

    # Finishing script
    logger.info(u"Count Digits Complete!")


if __name__ == '__main__':
    # Parse the command line
    args = getOptions()

    # Configure the root logger; verbose output only when --debug is set
    logger = logging.getLogger()
    if not args.debug:
        sl.setLogger(logger)
    else:
        sl.setLogger(logger, logLevel='debug')

    # Report the parameters this run was launched with
    logger.info((u"Importing data with following parameters: "
                 "\n\tWide: {0}"
                 "\n\tDesign: {1}"
                 "\n\tUnique ID: {2}"
                 "\n\tGroup: {3}"
                 "\n\tHtml: {4}").format(args.input,
                                         args.design,
                                         args.uniqID,
                                         args.group,
                                         args.html))

    # Hand control to the script body
    main(args)
Ejemplo n.º 2
0
            # Iterating over groups
            for name, group in dat.design.groupby(args.group):
                logger.info(u"Plotting for group {0}".format(name))

                # Plotting Density and Box plot for the group
                plotDensity(data=wide.T[group.index], name=name, pdf=pdf)

        # Get colors for each feature for "All groups"
        logger.info(u"Plotting for group {0}".format("samples"))
        palette.getColors(design=dat.design, groups=[])

        # Plotting density and boxplots for all
        plotDensity(data=wide, name="samples", pdf=pdf)

        #Ending script
        logger.info(u"Ending script")


if __name__ == '__main__':
    # Parse the command line
    args = getOptions()

    # Configure the root logger with the default level
    logger = logging.getLogger()
    sl.setLogger(logger)

    # Report the input parameters of this run
    logger.info(("Importing data with following parameters: "
                 "\n\tWide: {0}"
                 "\n\tDesign: {1}"
                 "\n\tUnique ID: {2}").format(args.input,
                                              args.design,
                                              args.uniqID))

    # Module-level color handler built from the requested palette/color scheme
    palette = colorHandler(pal=args.palette, col=args.color)
    logger.info(
        u"Using {0} color scheme from {1} palette".format(args.color,
                                                          args.palette))

    # Hand control to the script body
    main(args)
Ejemplo n.º 3
0
def main():
    """
    Translate the ENSEMBL IDs of a gene annotation file into gene symbols.

    Take a gene expression annotation table and extract the column with ENSEMBL IDs. Then query
    mygene to translate the ENSEMBL IDs into the gene symbols needed for the rest of the
    pipeline. The output is a tab-separated table with the original annotation columns plus
    GeneSymbol, Score and Selected. Selected is "Yes" for the first row of every ENSEMBL ID and
    "No" for any further row of the same ID, which is useful in cases of multiple matches.

    When the number of rows returned by mygene equals the number of missing queries (i.e.
    nothing matched), a plain-text message is written to the output file instead of a table.

    Arguments (read from the command line through getOptions()):
        :param species: Species to download information from mygene
        :type species: string

        :param geneAnnot: Gene Expression Annotation file with ENSEMBL IDs column
        :type geneAnnot: file

        :param uniqId: Name of the column with the unique identifiers (checked for duplicates)
        :type uniqId: string

        :param ensemblId: Name of the column with ENSEMBL IDs
        :type ensemblId: string

    Returns:
        :return output: Annotation table with gene symbols, or an explanatory message
        :rtype output: file
    """

    args = getOptions()
    logger = logging.getLogger()
    sl.setLogger(logger)
    logger.info(
        u"""Importing data with following parameters: \
        \n\tSpecies: {0}\
        \n\tGene Annotation File: {1}\
        \n\tUnique ID column: {2}\
        \n\tENSEMBL ID Column: {3}""".format(
            args.species, args.geneAnnot, args.uniqId, args.ensemblId
        )
    )
    modules.checkForDuplicates(args.geneAnnot, args.uniqId)

    # Original Gene Expression Annotation Dataset with ENSEMBL IDs
    genesTable = pd.read_table(args.geneAnnot, delimiter="\t", header=0)

    # Find Gene Symbol
    mg = mygene.MyGeneInfo()
    genes = genesTable[args.ensemblId].tolist()
    genesTransformed = mg.querymany(
        genes,
        scopes="ensembl.gene",
        fields="symbol",
        species=args.species,
        verbose=False,
        returnall=True,
        as_dataframe=True,
        df_index=False,
    )
    genesTransformedTable = genesTransformed["out"]

    # As many result rows as missing queries: nothing could be translated
    if genesTransformedTable.shape[0] == genesTransformed["missing"].shape[0]:
        with open(args.output, 'w') as f:
            f.write("no matching result! Please check the selected species and input files.")
        return

    # Keep only the columns of interest (this also leaves "_id" out) and give
    # them their output names
    genesTransformedTable = genesTransformedTable[["query", "symbol", "_score"]]
    genesTransformedTable.columns = [args.ensemblId, "GeneSymbol", "Score"]

    # Merge Both datasets
    newGenesTable = pd.merge(genesTable, genesTransformedTable, on=args.ensemblId)

    # In case of duplicates, select the first one (presumably the highest
    # score; this relies on the order in which mygene returns its hits)
    newGenesTable["Selected"] = "Yes"
    isDup = newGenesTable.duplicated(subset=args.ensemblId, keep="first")
    # Single .loc assignment: chained indexing (df[col][mask] = ...) may write
    # to a temporary copy and leave the table unchanged
    newGenesTable.loc[isDup, "Selected"] = "No"

    # Write table
    newGenesTable.to_csv(args.output, sep="\t", index=False)
Ejemplo n.º 4
0
def main():
    """
    Add binary flags (0/1) to a differential expression dataset depending on p-value thresholds.

    For every data row and every threshold, the flag is 1 when the row's p-value is lower than
    or equal to the threshold and 0 otherwise.

    Arguments (read from the command line through getOptions()):
        :param deaDataset: Matrix with Differential Expression Analysis information
        :type deaDataset: file

        :param pvalue: Name of the column with the p-value information
        :type pvalue: string

        :param uniqID: Name of the column with the unique identifier
        :type uniqID: string

        :param thresholds: Desired flag thresholds. Must be separated with ",", no spaces allowed.
        :type thresholds: string

    Returns:
        :return output: Table with input and added correspondent flags columns
        :rtype output: file

        :return flags: Table with only the correspondent flags columns
        :rtype flags: file

        :return args: The parsed command line options
    """
    args = getOptions()
    logger = logging.getLogger()
    sl.setLogger(logger)
    logger.info(
        u"""Importing data with following parameters: \
        \n\tDEA Dataset: {0}\
        \n\tUnique ID: {1}\
        \n\tPvalues: {2}\
        \n\tThresholds: {3}""".format(
            args.deaDataset, args.uniqID, args.pvalue, args.thresholds
        )
    )

    modules.checkForDuplicates(args.deaDataset, args.uniqID)

    with open(args.deaDataset, "r") as data:
        header = data.readline().strip().split("\t")

    thresholds = args.thresholds.split(",")
    # Convert once instead of once per data row
    cutoffs = [float(threshold) for threshold in thresholds]

    # Clean column names: empty names become "NA", quotes are removed
    header_list = [
        "NA" if word == "" else word.replace('"', "") for word in header
    ]

    # Positions of the columns used below (loop invariant)
    pvalue_col = header_list.index(args.pvalue)
    uniq_col = header_list.index(args.uniqID)

    flag_header = "".join("\tFlag_" + threshold for threshold in thresholds)

    # Get P value column from a DEA dataset
    # NOTE(review): genfromtxt treats "#" as a comment marker by default, so
    # rows containing "#" may not line up with the linecache rows below.
    deaTable = genfromtxt(
        args.deaDataset,
        delimiter="\t",
        usecols=pvalue_col,
        dtype=None,
    )
    # First entry is the column name, not a p-value
    deaTable = np.delete(deaTable, 0)

    # Context manager guarantees both files are flushed and closed, also when
    # a malformed row raises midway
    with open(args.output, "w") as output, open(args.flags, "w") as flags:
        # Joining guarantees exactly one tab between columns, whatever the
        # column names are (duplicated or empty names included)
        output.write("\t".join(header_list) + flag_header + "\n")
        flags.write(str(args.uniqID) + flag_header + "\n")

        # Add 1/0 if smaller/greater than threshold. Data rows start at line 2
        # of the input file (line 1 is the header).
        for line_number, pvalue in enumerate(deaTable, start=2):
            line = linecache.getline(args.deaDataset, line_number).strip()
            pvalue = float(pvalue.strip())
            flag_cells = "".join(
                "\t1" if pvalue <= cutoff else "\t0" for cutoff in cutoffs
            )
            flags.write(line.split("\t")[uniq_col] + flag_cells + "\n")
            output.write(line + flag_cells + "\n")

    return args
Ejemplo n.º 5
0
    ror_df.to_csv(args.table,
                  sep="\t",
                  float_format="%.4f",
                  index_label=args.uniqID,
                  columns=["pval", "rsq", "slope"])
    ror_flags.df_flags.to_csv(args.flags, sep="\t", index_label=args.uniqID)


if __name__ == "__main__":
    # Parse the command line
    args = getOptions()

    # Configure the root logger; debug mode also raises the DEBUG flag
    logger = logging.getLogger()
    if not args.debug:
        sl.setLogger(logger)
    else:
        sl.setLogger(logger, logLevel="debug")
        DEBUG = True

    # Report the parameters this run was launched with (same text as a
    # triple-quoted block, spelled with explicit newlines)
    logger.info((u"Importing data with following parameters: \n"
                 u"            \tWide: {0}\n"
                 u"            \tDesign: {1}\n"
                 u"            \tUnique ID: {2}\n"
                 u"            \tGroup: {3}\n"
                 u"            \tRun Order: {4}\n"
                 u"            \tLevels: {5}\n"
                 u"            ").format(args.input,
                                         args.design,
                                         args.uniqID,
                                         args.group,
                                         args.order,
                                         args.levels))
Ejemplo n.º 6
0
def main():
    """
    Perform a correlation analysis of a Gene Expression Dataset and a Metabolomic Dataset.

    The computation itself is delegated to the R function corr_main_func, loaded from the
    all_by_all_correlation.R script shipped in the gaitGM.data package.

    Arguments (read from the command line through getOptions()):
        :param geneDataset metDataset: Gene expression/Metabolomics wide dataset, respectively.
        :type geneDataset metDataset: files

        :param geneId metId: Name of the Genes/metabolites unique identifier column, respectively.
        :type geneId metId: strings

        :param geneAnnot metAnnot: Gene Expression/Metabolomics Annotation Datasets, respectively.
        When given, identifiers are translated through modules.Ids2Names; otherwise the
        identifier column is used as the index.
        :type geneAnnot metAnnot: files

        :param geneAnnotName metAnnotName: Name of the column of the Annotation file that contains
        genes/metabolites names respectively (read here as args.geneName / args.metName).
        :type geneAnnotName metAnnotName: strings

        :param meth: Methodology for the correlation function. One of 'pearson', 'spearman' or
        'kendall'.
        :type meth: string

        :param thres: PValue Threshold to cut the correlations for the output table.
        :type thres: float

    Returns:
        :return output: Output table with the following information: Metabolite "\t" Gene "\t"
        Correlation "\t" pvalue

        :rtype output: file

        :return corMat: Correlation Matrix
        :rtype corMat: file

        :return fig: Network-like output figure
        :rtype fig: pdf
    """

    warnings.filterwarnings("ignore", category=RRuntimeWarning)
    args = getOptions()
    logger = logging.getLogger()
    sl.setLogger(logger)
    logger.info(u"Importing data with the following parameters: "
                "\n\tGene Dataset:  {}"
                "\n\tGene UniqueID:  {}"
                "\n\tMet Dataset:{}"
                "\n\tMet UniqueID:  {}"
                "\n\tMethod:  {}"
                "\n\tThreshold:  {}".format(
                    args.geneDataset,
                    args.geneId,
                    args.metDataset,
                    args.metId,
                    args.meth,
                    args.thres,
                ))

    modules.checkForDuplicates(args.geneDataset, args.geneId)
    modules.checkForDuplicates(args.metDataset, args.metId)
    pandas2ri.activate()

    # Load the R source; the nested context manager closes the file handle
    # (it was previously left open)
    with ires.path("gaitGM.data",
                   "all_by_all_correlation.R") as my_r_script_path:
        with open(my_r_script_path, "r") as f:
            rFile = f.read()
    allByAllCorrScript = STAP(rFile, "corr_main_func")

    # Prepare Gene Expression Data
    geneTable = pd.read_table(args.geneDataset, sep="\t", header=0)
    if args.geneAnnot:
        R_gene_df = modules.Ids2Names(geneTable, args.geneId, args.geneAnnot,
                                      args.geneName)
    else:
        geneTable = geneTable.set_index(args.geneId)
        R_gene_df = pandas2ri.py2rpy(geneTable)

    # Prepare Metabolomics Data
    metTable = pd.read_table(args.metDataset, sep="\t", header=0)
    if args.metAnnot:
        R_met_df = modules.Ids2Names(metTable, args.metId, args.metAnnot,
                                     args.metName)
    else:
        metTable = metTable.set_index(args.metId)
        R_met_df = pandas2ri.py2rpy(metTable)

    # The R function writes the three outputs itself
    allByAllCorrScript.corr_main_func(
        x=R_gene_df,
        y=R_met_df,
        meth=args.meth,
        thres=args.thres,
        corrMatPath=args.corMat,
        outputPath=args.output,
        figurePath=args.fig,
    )
Ejemplo n.º 7
0
def main():
    """
    Performs a Sparse Partial Least Squares (sPLS) analysis over subsets of gene expression and
    metabolomic data. To perform this subsetting, three different methodologies can be used for the
    metabolites:
    - By generic metabolite (sphingomyelin, ...)
    - By MMC cluster
    - By generic metabolite and then by MMC cluster
    and four for the genes:
    - All the genes
    - Genes contained in a list with interesting genes for the analysis
    - Pathway related genes for an specific generic metabolite
    - Metagenes (PANA approach)

    The outputs depend on the inputs.

    Arguments (read from the command line through getOptions()):
        :param geneDataset metDataset: Gene expression/Metabolomics wide dataset, respectively.
        :type geneDataset metDataset: files

        :param geneId metId: Name of the Genes/metabolites unique identifier column, respectively.
        :type geneId metId: strings

        :param geneAnnot metAnnot: Gene Expression/Metabolomics Annotation Dataset.
        :type geneAnnot metAnnot: files

        :param geneAnnotName metAnnotName: annotation file column with gene/metabolite names.
        :type geneAnnotName metAnnotName: strings

        :param design: Design File
        :type design: file

        :param keepX: Number of genes to keep in the sPLS model
        :type keepX: integer

        :param thres: Threshold passed to the R corrMat function when building the
        correlation table
        :type thres: float

        :param geneOption metOption: Options for metabolite subsetting (one of 'generic', 'mmc' or
        'both') and for gene expression subsetting (one of 'all', 'geneList', 'path' or 'pana')
        :type geneOption metOption: strings

        :param geneKeggAnno metKeggAnno: KEGG Annotation files for gene expression and
        metabolomics, respectively. From Add KEGG Anno Info Tool
        :type geneKeggAnno metKeggAnno: files

        :param geneKeggPath metKeggPath: KEGG Pathway files for gene expression and metabolomics,
        respectively. From Add KEGG Pathway Info Tool
        :type geneKeggPath metKeggPath: files

        :param path2genes: Downloaded KEGG file with this information: pathway_ID "\t" geneKEGG_ID
        :type path2genes: file

    Returns:
        :return figure1: sPLS heatmaps
        :rtype figure1: pdf

        :return splsOut: sif-like correlation matrix including a column describing the comparison.
        :rtype splsOut: file

        :return figure2: MMC plots if mmc or both metabolite subsetting option is selected.
        :rtype figure2: pdf

        :return mmcOut: MMC Output table if mmc or both metabolite subsetting option is selected.
        :rtype mmcOut: file

        :return panaOut: Table describing genes that forms the metagenes (1/0)
        :rtype panaOut: file
    """
    args = getOptions()
    logger = logging.getLogger()
    sl.setLogger(logger)
    logger.info(u"Importing data with the following parameters: "
                "\n\tGene Dataset:  {}"
                "\n\tGene UniqueID:  {}"
                "\n\tGene Option: {}"
                "\n\tMetabolite Dataset:{}"
                "\n\tMetabolite UniqueID:  {}"
                "\n\tMetabolite Option: {}".format(
                    args.geneDataset,
                    args.geneId,
                    args.geneOption,
                    args.metDataset,
                    args.metId,
                    args.metOption,
                ))
    pandas2ri.activate()

    # Load the R source; the nested context manager closes the file handle
    # (it was previously left open)
    with ires.path("gaitGM.data", "sPLS.R") as my_r_script_path:
        with open(my_r_script_path, "r") as f:
            rFile = f.read()
    sPLSScript = STAP(rFile, "sPLS")

    rGeneData, rMetData, multipleNames, multipleNamesId = modules.prepareSPLSData(
        args)

    # One sPLS model per metabolite subset. Only with the "path" gene option
    # does the gene subset advance together with the metabolite subset;
    # otherwise the first gene dataset is reused for every metabolite subset.
    rData = []
    data_counter = 0
    for R_met_df in rMetData:
        rData.append(
            sPLSScript.sPLS(geneData=rGeneData[data_counter],
                            metData=R_met_df,
                            keepX=args.keepX))
        if args.geneOption == "path":
            data_counter += 1

    # The "both" metabolite option labels the results with multipleNamesId,
    # every other option with multipleNames; the rest of the output step is
    # the same for all options.
    if args.metOption == "both":
        comparisonNames = multipleNamesId
    else:
        comparisonNames = multipleNames

    sPLSScript.plotInPdf(splsObjects=rData,
                         figurePath=args.figure1,
                         multipleNames=comparisonNames)

    # Correlation Matrix
    corMatrix = sPLSScript.corrMat(splsObjects=rData,
                                   multipleNames=comparisonNames,
                                   threshold=args.thres)
    robjects.r["write.table"](
        corMatrix,
        file=args.splsOut,
        sep="\t",
        quote=False,
        row_names=False,
        col_names=True,
    )