Esempi in Python per Pipeline.load, esempi in Python per CGATPipelines.Pipeline.load

Esempio n. 1

0

Mostra file

File: pipeline_exome_cancer.py Progetto: gjaime/CGATPipelines

def loadMutectExtendedOutput(infile, outfile):
    '''Load the extended MuTect call statistics into the database.

    The file that is loaded is named after `infile`, with
    ".mutect.snp.vcf" replaced by "_call_stats.out". The load is run
    with ``--add-index=contig,position``.
    '''
    call_stats = infile.replace(".mutect.snp.vcf", "_call_stats.out")
    index_option = "--add-index=contig,position"

    P.load(call_stats, outfile, options=index_option)

Esempio n. 2

0

Mostra file

File: pipeline_promotors.py Progetto: CGATOxford/CGATPipelines

def loadTranscriptSummary(infile, outfile):
    '''Summarize binding information per transcript.

    For every transcript id found in ``annotations.transcript_info`` one
    row is written that holds a 0/1 flag for each of the ``tata`` and
    ``cpg`` tables, telling whether the transcript id occurs in that
    table. The rows are written to a temporary file which is loaded
    with ``P.load`` and removed afterwards, also when loading fails.

    Arguments
    ---------
    infile : string
        Not read by this function.
    outfile : string
        Passed to ``P.toTable`` to name the table that is dropped
        beforehand, and to ``P.load`` as the load target.
    '''
    dbh = connect()
    out_table = P.toTable(outfile)

    cc = dbh.cursor()
    cc.execute("""DROP TABLE IF EXISTS %(out_table)s""" % locals())

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info").fetchall()]

    # sqlite can not do full outer join, so the membership of each
    # source table is collected separately and combined in Python.
    sources = ("tata", "cpg")
    members = []
    for source in sources:
        rows = cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %s" % source).fetchall()
        members.append({x[0] for x in rows})

    tmpf = P.getTempFile()
    try:
        tmpf.write("transcript_id\t%s\n" % "\t".join(sources))

        for transcript_id in transcripts:
            flags = [str(int(transcript_id in ids)) for ids in members]
            tmpf.write("%s\t%s\n" % (transcript_id, "\t".join(flags)))

        tmpf.close()

        P.load(tmpf.name, outfile)
    finally:
        # closing twice is harmless; makes sure the handle is released
        # before the file is removed on the error path as well.
        tmpf.close()
        os.unlink(tmpf.name)

Esempio n. 3

0

Mostra file

File: PipelineTranscriptDiffExpression.py Progetto: gjaime/CGATPipelines

def loadSleuthTable(infile, outfile, transcript_info, gene_biotypes,
                    database, annotations_database):
    '''Join a sleuth result table with transcript annotations and load it.

    Arguments
    ---------
    infile : string
        Tab-separated table with a ``transcript_id`` column.
    outfile : string
        Passed to ``P.load`` as the load target.
    transcript_info : string
        The basename of this value is used as the name of the table in
        the ``annotations`` schema that is queried.
    gene_biotypes : string
        Comma-separated gene biotypes to restrict the annotation query
        to. If empty, no restriction is applied.
    database : string
        Passed to ``connect``.
    annotations_database : string
        Passed to ``connect``.
    '''
    table = os.path.basename(transcript_info)

    if gene_biotypes:
        where_cmd = "WHERE " + " OR ".join(
            ["gene_biotype = '%s'" % x
             for x in gene_biotypes.split(",")])
    else:
        where_cmd = ""

    select = """SELECT DISTINCT
        transcript_id, transcript_biotype, gene_id, gene_name
        FROM annotations.%(table)s
        %(where_cmd)s""" % {"table": table, "where_cmd": where_cmd}

    df1 = pd.read_table(infile, sep="\t")
    df1.set_index("transcript_id", drop=True, inplace=True)

    df2 = pd.read_sql(select, connect(database, annotations_database))
    df2.set_index("transcript_id", drop=False, inplace=True)

    # index-on-index join: rows of the sleuth table are kept and
    # annotated where the transcript id is known.
    df = df1.join(df2)

    tmpfile = P.getTempFilename("/ifs/scratch/")
    try:
        df.to_csv(tmpfile, sep="\t", index=True)
        P.load(tmpfile, outfile, options="--add-index=transcript_id")
    finally:
        # remove the temporary table even if writing or loading failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)

Esempio n. 4

0

Mostra file

def loadNCG(outfile):
    '''Load the NCG cancer gene table into the database.

    The input file name is read from ``PARAMS["cancergenes_table"]``
    and the load is run with ``--add-index=symbol``.
    '''
    cancer_genes = PARAMS["cancergenes_table"]

    P.load(infile=cancer_genes,
           outfile=outfile,
           options="--add-index=symbol")

Esempio n. 5

0

Mostra file

File: pipeline_bamstats.py Progetto: CGATOxford/CGATPipelines

def loadExonValidation(infiles, outfile):
    '''Load merged and per-sample exon validation statistics.

    All files in `infiles` are merged and loaded through
    ``P.mergeAndLoad``. In addition, for every input file the
    companion file ``<infile>.overrun.gz`` is loaded using
    ``<track>_overrun.load`` as load target, where ``<track>`` is the
    input file name without the ".exon.validation.tsv.gz" suffix.

    Parameters
    ----------
    infiles : list
       Input filenames with exon validation stats
    outfile : str
       Output filename
    '''
    validation_suffix = ".exon.validation.tsv.gz"

    P.mergeAndLoad(infiles, outfile, suffix=validation_suffix)

    for stats_file in infiles:
        overrun_target = P.snip(stats_file, validation_suffix) + "_overrun.load"
        P.load(stats_file + ".overrun.gz", overrun_target)

Esempio n. 6

0

Mostra file

File: PipelineTranscriptDiffExpression.py Progetto: dormeight/CGATPipelines

def loadSleuthTableGenes(infile, outfile, gene_info, gene_biotypes, database,
                         annotations_database):
    '''Join a gene-level sleuth result table with gene annotations and load it.

    Arguments
    ---------
    infile : string
        Tab-separated table with a ``test_id`` column.
    outfile : string
        Passed to ``P.load`` as the load target.
    gene_info : string
        The basename of this value is used as the name of the table in
        the ``annotations`` schema that is queried.
    gene_biotypes : string
        Comma-separated gene biotypes to restrict the annotation query
        to. If empty, no restriction is applied.
    database : string
        Passed to ``connect``.
    annotations_database : string
        Passed to ``connect``.
    '''
    table = os.path.basename(gene_info)

    if gene_biotypes:
        where_cmd = "WHERE " + " OR ".join(
            ["gene_biotype = '%s'" % x for x in gene_biotypes.split(",")])
    else:
        where_cmd = ""

    select = """SELECT DISTINCT
        gene_id, gene_name
        FROM annotations.%(table)s
        %(where_cmd)s""" % {"table": table, "where_cmd": where_cmd}

    df1 = pd.read_table(infile, sep="\t")
    df1.set_index("test_id", drop=False, inplace=True)

    df2 = pd.read_sql(select, connect(database, annotations_database))
    df2.set_index("gene_id", drop=False, inplace=True)

    # index-on-index join; presumably test_id holds gene ids so that it
    # lines up with gene_id -- TODO confirm against the sleuth output.
    df = df1.join(df2)

    tmpfile = P.getTempFilename("/ifs/scratch/")
    try:
        df.to_csv(tmpfile, sep="\t", index=True)
        P.load(tmpfile, outfile, options="--add-index=gene_id")
    finally:
        # remove the temporary table even if writing or loading failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)

Esempio n. 7

0

Mostra file

File: pipeline_scrnaseqqc.py Progetto: CGATOxford/CGATPipelines

def loadQcMeasures(infile, outfile):
    '''
    Load the QC measures in `infile` into the CSVDB, running the load
    with ``--add-index=track``.
    '''
    index_option = "--add-index=track"

    P.load(infile=infile, outfile=outfile, options=index_option)

Esempio n. 8

0

Mostra file

File: pipeline_scrnaseqqc.py Progetto: CGATOxford/CGATPipelines

def loadSailfishCounts(infile, outfile):
    '''
    Load the Sailfish gene count table in `infile` into the CSVDB,
    using `outfile` as load target and no extra load options.
    '''
    P.load(infile=infile, outfile=outfile)

Esempio n. 9

0

Mostra file

def loadSampleInfo(infile, outfile):
    '''Load the sample information table.

    The load is run with explicit header names (format, barcode,
    track, lanes) and with ``-i barcode -i track``.
    '''
    load_options = " ".join([
        "--header-names=format,barcode,track,lanes",
        "-i barcode",
        "-i track",
    ])

    P.load(infile, outfile, options=load_options)

Esempio n. 10

0

Mostra file

def loadSailfishCounts(infile, outfile):
    '''
    Load the Sailfish gene count table in `infile` into the CSVDB,
    using `outfile` as load target and no extra load options.
    '''
    P.load(infile=infile, outfile=outfile)

Esempio n. 11

0

Mostra file

File: pipeline_transcriptome.py Progetto: wbyu/CGATPipelines

def loadDistances(infile, outfile):
    '''Load the distance table in `infile` into the database.

    The ``gene_id`` and ``closest_id`` columns are both mapped to
    strings and passed to ``--add-index``.
    '''
    P.load(
        infile, outfile,
        "--add-index=gene_id --map=gene_id:str "
        "--add-index=closest_id --map=closest_id:str"
    )

Esempio n. 12

0

Mostra file

File: PipelineBamStats.py Progetto: hmyh1202/CGATPipelines

def loadPicardCoverageStats(infiles, outfile):
    '''Import coverage statistics into database.

    From every input file the comment lines (starting with "#") and
    blank lines are discarded. The first remaining line of the first
    file is used as header, and the second remaining line of every
    file is written as that track's data row. A ``track`` column is
    prepended; the track name is the file's basename without ".cov".

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''
    outf = P.getTempFile(".")
    try:
        header_written = False
        for f in infiles:
            track = P.snip(os.path.basename(f), ".cov")
            # close each input promptly instead of leaking the handle
            with open(f, "r") as inf:
                lines = [x for x in inf
                         if not x.startswith("#") and x.strip()]
            if not header_written:
                outf.write("%s\t%s" % ("track", lines[0]))
                header_written = True
            outf.write("%s\t%s" % (track, lines[1]))
        outf.close()

        P.load(outf.name,
               outfile,
               options="--ignore-empty --add-index=track")
    finally:
        # closing twice is harmless; always remove the temporary file
        outf.close()
        os.unlink(outf.name)

Esempio n. 13

0

Mostra file

File: pipeline_scrnaseq.py Progetto: snsansom/scseq

def loadCuffNormClassic(infile, outfile):
    '''Load the fpkm table from cuffnorm into the database.

    The table loaded is ``genes.fpkm_table`` located in the same
    directory as `infile`; the load is run with ``-i "tracking_id"``.
    '''
    # os.path.join keeps the path relative when `infile` has no
    # directory part, instead of pointing at "/genes.fpkm_table".
    fpkm_table = os.path.join(os.path.dirname(infile), "genes.fpkm_table")

    P.load(fpkm_table, outfile,
           options='-i "tracking_id"')

Esempio n. 14

0

Mostra file

File: pipeline_transfacmatch.py Progetto: gjaime/CGATPipelines

def loadGCContent(infile, outfile):
    '''
    Load the GC content results for each background and foreground
    set, running the load with ``--add-index=id``.
    '''
    index_option = "--add-index=id"

    P.load(infile=infile, outfile=outfile, options=index_option)

Esempio n. 15

0

Mostra file

File: pipeline_variant_annotation.py Progetto: Acribbs/CGATPipelines

def loadAnnotations(infile, outfile):
    '''Load variant annotations into the database.

    ``gene_id`` is mapped to a string and passed to ``--add-index``;
    ``base_qualities`` is mapped to text.
    '''
    flags = ("--map=gene_id:str",
             "--add-index=gene_id",
             "--map=base_qualities:text")
    # every flag is followed by a single blank, the last one included
    load_options = "".join(flag + " " for flag in flags)

    P.load(infile, outfile, options=load_options)

Esempio n. 16

0

Mostra file

def loadSailfishTpm(infile, outfile):
    '''
    Load the Sailfish TPM estimates in `infile` into the CSVDB, using
    `outfile` as load target and no extra load options.
    '''
    P.load(infile=infile, outfile=outfile)

Esempio n. 17

0

Mostra file

File: pipeline_scrnaseqqc.py Progetto: CGATOxford/CGATPipelines

def loadSailfishTpm(infile, outfile):
    '''
    Load the Sailfish TPM estimates in `infile` into the CSVDB, using
    `outfile` as load target and no extra load options.
    '''
    P.load(infile=infile, outfile=outfile)

Esempio n. 18

0

Mostra file

File: pipeline_exome_cancer.py Progetto: gjaime/CGATPipelines

def loadNCG(outfile):
    '''Load the NCG cancer gene table into the database.

    The input file name is read from ``PARAMS["cancergenes_table"]``
    and the load is run with ``--add-index=symbol``.
    '''
    cancer_genes = PARAMS["cancergenes_table"]

    P.load(infile=cancer_genes,
           outfile=outfile,
           options="--add-index=symbol")

Esempio n. 19

0

Mostra file

def loadExonValidation(infiles, outfile):
    '''Load merged and per-sample exon validation statistics.

    All files in `infiles` are merged and loaded through
    ``P.mergeAndLoad``. In addition, for every input file the
    companion file ``<infile>.overrun.gz`` is loaded using
    ``<track>_overrun.load`` as load target, where ``<track>`` is the
    input file name without the ".exon.validation.tsv.gz" suffix.

    Parameters
    ----------
    infiles : list
       Input filenames with exon validation stats
    outfile : str
       Output filename
    '''
    validation_suffix = ".exon.validation.tsv.gz"

    P.mergeAndLoad(infiles, outfile, suffix=validation_suffix)

    for stats_file in infiles:
        overrun_target = P.snip(stats_file, validation_suffix) + "_overrun.load"
        P.load(stats_file + ".overrun.gz", overrun_target)

Esempio n. 20

0

Mostra file

def loadMutectExtendedOutput(infile, outfile):
    '''Load the extended MuTect call statistics into the database.

    The file that is loaded is named after `infile`, with
    ".mutect.snp.vcf" replaced by "_call_stats.out". The load is run
    with ``--add-index=contig,position``.
    '''
    call_stats = infile.replace(".mutect.snp.vcf", "_call_stats.out")
    index_option = "--add-index=contig,position"

    P.load(call_stats, outfile, options=index_option)

Esempio n. 21

0

Mostra file

File: pipeline_variant_annotation.py Progetto: CGATOxford/CGATPipelines

def loadAnnotations(infile, outfile):
    '''Load variant annotations into the database.

    ``gene_id`` is mapped to a string and passed to ``--add-index``;
    ``base_qualities`` is mapped to text.
    '''
    flags = ("--map=gene_id:str",
             "--add-index=gene_id",
             "--map=base_qualities:text")
    # every flag is followed by a single blank, the last one included
    load_options = "".join(flag + " " for flag in flags)

    P.load(infile, outfile, options=load_options)

Esempio n. 22

0

Mostra file

File: PipelineMappingQC.py Progetto: CGATOxford/CGATPipelines

def loadIdxstats(infiles, outfile):
    '''Take list of file paths to samtools idxstats output files
    and merge to create single dataframe containing mapped reads per
    contig for each track. This dataframe is then loaded into
    database.

    For every track two summary rows, ``total_mapped_reads`` and
    ``total_reads`` (mapped plus unmapped), are added to the per-contig
    counts. The ``*`` entry of idxstats is dropped. Missing input
    files are skipped with a warning.

    Loads tables into the database
        * idxstats_reads_per_chromosome

    Arguments
    ---------
    infiles : list
        list where each element is a string of the filename containing samtools
        idxstats output. Filename format is expected to be 'sample.idxstats'
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''
    dfs = []
    for f in infiles:
        track = P.snip(f, ".idxstats").split('/')[-1]

        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        # reformat idx stats
        df = pandas.read_csv(f, sep='\t', header=None)
        df.columns = ['region', 'length', 'mapped', 'unmapped']

        # calc total reads (mapped & unmapped) and total mapped reads
        total_reads = df.unmapped.sum() + df.mapped.sum()
        total_mapped_reads = df.mapped.sum()

        reformatted_df = pandas.DataFrame(
            [['total_mapped_reads', total_mapped_reads],
             ['total_reads', total_reads],
             ['track', track]],
            columns=['region', 'mapped'])

        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent that works across pandas versions.
        df = pandas.concat([df, reformatted_df], ignore_index=True)
        df.set_index('region', inplace=True)
        df1 = df[['mapped']].T
        # set track as index
        df1.set_index('track', inplace=True)
        dfs.append(df1)

    # merge dataframes into single table
    master_df = pandas.concat(dfs)
    master_df.drop('*', axis=1, inplace=True)
    # transform dataframe to avoid reaching column limit
    master_df = master_df.T

    outf = P.getTempFile(".")
    try:
        master_df.to_csv(outf, sep='\t', index=True)
        outf.close()

        # NOTE(review): after the transpose the tracks are columns and
        # the index is named "region", so "track" may not be a column
        # of the written table -- confirm the --add-index target.
        P.load(outf.name,
               outfile,
               options="--ignore-empty --add-index=track")
    finally:
        # closing twice is harmless; always remove the temporary file
        outf.close()
        os.unlink(outf.name)

Esempio n. 23

0

Mostra file

File: pipeline_benchmark_rnaseqmappers.py Progetto: hainm/CGATPipelines

def loadExonValidation(infiles, outfile):
    """Merge the exon validation stats into one table and load overruns.

    All `infiles` are merged and loaded with ``mergeAndLoad``. For each
    input file, ``<infile>.overrun.gz`` is additionally loaded with
    ``<track>_overrun.load`` as load target.
    """
    validation_suffix = ".exon.validation.tsv.gz"

    mergeAndLoad(infiles, outfile, suffix=validation_suffix)

    for stats_file in infiles:
        overrun_target = P.snip(stats_file, validation_suffix) + "_overrun.load"
        P.load(stats_file + ".overrun.gz", overrun_target)

Esempio n. 24

0

Mostra file

File: pipeline_transfacmatch.py Progetto: gjaime/CGATPipelines

def loadEnrichmentOfTFBS(infile, outfile):
    '''
    Load the enrichment results in `infile`, running the load with
    ``--add-index=matrix_id``.
    '''
    index_option = "--add-index=matrix_id"

    P.load(infile=infile, outfile=outfile, options=index_option)

Esempio n. 25

0

Mostra file

def loadExonValidation(infiles, outfile):
    '''Merge the exon validation stats into one table and load overruns.

    All `infiles` are merged and loaded with ``mergeAndLoad``. For each
    input file, ``<infile>.overrun.gz`` is additionally loaded with
    ``<track>_overrun.load`` as load target.
    '''
    validation_suffix = ".exon.validation.tsv.gz"

    mergeAndLoad(infiles, outfile, suffix=validation_suffix)

    for stats_file in infiles:
        overrun_target = P.snip(stats_file, validation_suffix) + "_overrun.load"
        P.load(stats_file + ".overrun.gz", overrun_target)

Esempio n. 26

0

Mostra file

File: pipeline_rnaseqqc.py Progetto: gjaime/CGATPipelines

def loadMetaInformation(infile, outfile):
    '''Load the meta information table.

    The ``id``, ``sample_id`` and ``experiment_id`` columns are mapped
    to integers and each is passed to ``--add-index``.
    '''
    flags = ("--map=id:int",
             "--map=sample_id:int",
             "--map=experiment_id:int",
             "--add-index=id",
             "--add-index=experiment_id",
             "--add-index=sample_id")
    # every flag is followed by a single blank, the last one included
    load_options = "".join(flag + " " for flag in flags)

    P.load(infile, outfile, options=load_options)

Esempio n. 27

0

Mostra file

def loadFimo(infile, outfile):
    '''Load a FIMO result table, supplying the column names via ``-H``.'''
    columns = ("pattern_name", "sequence_name", "start", "stop", "strand",
               "score", "p_value", "q_value", "matched_sequence")
    header_option = '-H "%s" ' % ",".join(columns)

    P.load(infile=infile, outfile=outfile, options=header_option)

Esempio n. 28

0

Mostra file

File: PipelineMotifs.py Progetto: gjaime/CGATPipelines

def loadBioProspector(infile, outfile):
    '''Load results from bioprospector.

    For every motif reported in `infile` a logo is built in the
    ``bioprospector`` sub-directory of ``PARAMS["exportdir"]`` and one
    row per match (id, motif number, start, end, strand, arrangement)
    is written to a temporary table, which is then loaded.

    The arrangement is "ER" for strands "+-"/"-+", "DR" for "++"/"--"
    and "SM" otherwise, followed by the distance between the two
    blocks (0 for "SM").
    '''
    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "bioprospector")

    try:
        os.makedirs(target_path)
    except OSError:
        # An already existing directory is fine; any other failure
        # (e.g. missing permissions) must not be hidden.
        if not os.path.isdir(target_path):
            raise

    track = infile[:-len(".bioprospector")]

    tmpfile = P.getTempFile()
    try:
        tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

        # keep the input open while iterating, in case the parser is lazy
        with IOTools.openFile(infile, "r") as inf:
            for x, motifs in enumerate(Bioprospector.parse(inf)):
                outname = os.path.join(
                    target_path, "%s_%02i.png" % (track, x))
                Bioprospector.build_logo(
                    [y.sequence for y in motifs.matches], outname)

                for match in motifs.matches:

                    distance = abs(
                        match.start + match.width1 -
                        (match.end - match.width2))

                    if match.strand in ("+-", "-+"):
                        arrangement = "ER"
                    elif match.strand in ("++", "--"):
                        arrangement = "DR"
                    else:
                        arrangement = "SM"
                        distance = 0

                    arrangement += "%i" % distance
                    strand = match.strand[0]

                    # keep only the part after the last underscore
                    match_id = re.sub(r".*_", "", match.id)
                    tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                                  (match_id,
                                   x,
                                   match.start,
                                   match.end,
                                   strand,
                                   arrangement))
        tmpfile.close()

        # NOTE(review): the table written above has no base_qualities
        # column; the --map option looks copied from elsewhere -- confirm.
        P.load(tmpfile.name,
               outfile,
               options="--add-index=id "
               "--add-index=motif "
               "--add-index=id,motif "
               "--allow-empty-file "
               "--map=base_qualities:text")
    finally:
        # closing twice is harmless; always remove the temporary file
        tmpfile.close()
        os.unlink(tmpfile.name)

Esempio n. 29

0

Mostra file

File: PipelineDeNovoMotifs.py Progetto: growland1/research_project

def loadBioProspector(infile, outfile):
    '''Load results from bioprospector.

    For every motif reported in `infile` a logo is built in the
    ``bioprospector`` sub-directory of ``PARAMS["exportdir"]`` and one
    row per match (id, motif number, start, end, strand, arrangement)
    is written to a temporary table, which is then loaded.

    The arrangement is "ER" for strands "+-"/"-+", "DR" for "++"/"--"
    and "SM" otherwise, followed by the distance between the two
    blocks (0 for "SM").
    '''
    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "bioprospector")

    try:
        os.makedirs(target_path)
    except OSError:
        # An already existing directory is fine; any other failure
        # (e.g. missing permissions) must not be hidden.
        if not os.path.isdir(target_path):
            raise

    track = infile[:-len(".bioprospector")]

    tmpfile = P.getTempFile()
    try:
        tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

        # keep the input open while iterating, in case the parser is lazy
        with IOTools.openFile(infile, "r") as inf:
            for x, motifs in enumerate(Bioprospector.parse(inf)):
                outname = os.path.join(
                    target_path, "%s_%02i.png" % (track, x))
                Bioprospector.build_logo(
                    [y.sequence for y in motifs.matches], outname)

                for match in motifs.matches:

                    distance = abs(
                        match.start + match.width1 -
                        (match.end - match.width2))

                    if match.strand in ("+-", "-+"):
                        arrangement = "ER"
                    elif match.strand in ("++", "--"):
                        arrangement = "DR"
                    else:
                        arrangement = "SM"
                        distance = 0

                    arrangement += "%i" % distance
                    strand = match.strand[0]

                    # keep only the part after the last underscore
                    match_id = re.sub(r".*_", "", match.id)
                    tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                                  (match_id,
                                   x,
                                   match.start,
                                   match.end,
                                   strand,
                                   arrangement))
        tmpfile.close()

        # NOTE(review): the table written above has no base_qualities
        # column; the --map option looks copied from elsewhere -- confirm.
        P.load(tmpfile.name,
               outfile,
               options="--add-index=id "
               "--add-index=motif "
               "--add-index=id,motif "
               "--allow-empty-file "
               "--map=base_qualities:text")
    finally:
        # closing twice is harmless; always remove the temporary file
        tmpfile.close()
        os.unlink(tmpfile.name)

Esempio n. 30

0

Mostra file

File: PipelineMappingQC.py Progetto: wbyu/CGATPipelines

def loadIdxstats(infiles, outfile):
    '''Take list of file paths to samtools idxstats output files
    and merge to create single dataframe containing mapped reads per
    contig for each track. This dataframe is then loaded into
    database.

    For every track two summary rows, ``total_mapped_reads`` and
    ``total_reads`` (mapped plus unmapped), are added to the per-contig
    counts. The ``*`` entry of idxstats is dropped. Missing input
    files are skipped with a warning.

    Loads tables into the database
        * idxstats_reads_per_chromosome

    Arguments
    ---------
    infiles : list
        list where each element is a string of the filename containing samtools
        idxstats output. Filename format is expected to be 'sample.idxstats'
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''
    dfs = []
    for f in infiles:
        track = P.snip(f, ".idxstats").split('/')[-1]

        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        # reformat idx stats
        df = pandas.read_csv(f, sep='\t', header=None)
        df.columns = ['region', 'length', 'mapped', 'unmapped']

        # calc total reads (mapped & unmapped) and total mapped reads
        total_reads = df.unmapped.sum() + df.mapped.sum()
        total_mapped_reads = df.mapped.sum()

        reformatted_df = pandas.DataFrame(
            [['total_mapped_reads', total_mapped_reads],
             ['total_reads', total_reads],
             ['track', track]],
            columns=['region', 'mapped'])

        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent that works across pandas versions.
        df = pandas.concat([df, reformatted_df], ignore_index=True)
        df.set_index('region', inplace=True)
        df1 = df[['mapped']].T
        # set track as index
        df1.set_index('track', inplace=True)
        dfs.append(df1)

    # merge dataframes into single table
    master_df = pandas.concat(dfs)
    master_df.drop('*', axis=1, inplace=True)
    # transform dataframe to avoid reaching column limit
    master_df = master_df.T

    outf = P.getTempFile(".")
    try:
        master_df.to_csv(outf, sep='\t', index=True)
        outf.close()

        # NOTE(review): after the transpose the tracks are columns and
        # the index is named "region", so "track" may not be a column
        # of the written table -- confirm the --add-index target.
        P.load(outf.name, outfile, options="--ignore-empty --add-index=track")
    finally:
        # closing twice is harmless; always remove the temporary file
        outf.close()
        os.unlink(outf.name)

Esempio n. 31

0

Mostra file

def loadTranscriptProfile(infiles,
                          outfile,
                          suffix="transcript_profile",
                          tablename=None):
    '''Load transcript profiles into one table.

    For every input file the companion matrix
    ``<infile>.geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz``
    is read, its ``none`` column is renamed to the track name (the
    basename of `infile` without ".transcriptprofile.gz") and the
    ``area``, ``counts`` and ``background`` columns are dropped. The
    tables are merged on ``bin``, ``region`` and ``region_bin``.

    Arguments
    ---------
    infiles : string
        Filenames of files with matrix from bam2geneprofile. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Used as the table name when `tablename` is unset.
    tablename : string
        Tablename to use. If unset, `suffix` is used as the table name.

    Raises
    ------
    ValueError
        If `infiles` is empty, as there is nothing to load.
    '''

    if not tablename:
        tablename = "%s" % suffix

    table_join = None

    for infile in infiles:

        matrix_file = str(
            infile
        ) + ".geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz"
        name = P.snip(os.path.basename(infile), ".transcriptprofile.gz")

        table = pd.read_csv(matrix_file, sep="\t")
        table.rename(columns={'none': name}, inplace=True)
        table.drop(["area", "counts", "background"], axis=1, inplace=True)

        if table_join is None:
            table_join = table
        else:
            table_join = table.merge(table_join,
                                     on=["bin", "region", "region_bin"],
                                     how="left")

    if table_join is None:
        raise ValueError("no transcript profile files were given to load")

    outf = P.getTempFile(".")
    try:
        table_join.to_csv(outf, sep="\t", index=False)
        outf.close()

        P.load(infile=outf.name,
               outfile=outfile,
               tablename=tablename,
               options="--add-index=bin")
    finally:
        # closing twice is harmless; always remove the temporary file
        outf.close()
        os.unlink(outf.name)

Esempio n. 32

0

Mostra file

File: pipeline_rnaseqlncrna.py Progetto: gjaime/CGATPipelines

def loadCPCResults(infile, outfile):
    '''
    Load the CPC analysis results in `infile`, naming the columns
    transcript_id, feature, C_NC and CP_score and passing
    ``--add-index=transcript_id``.
    '''
    load_options = " ".join([
        "--header-names=transcript_id,feature,C_NC,CP_score",
        "--add-index=transcript_id",
    ])

    P.load(infile=infile, outfile=outfile, options=load_options)

Esempio n. 33

0

Mostra file

File: pipeline_retained_introns.py Progetto: sudlab/pipeline_retained_introns

def load_chunk_annotations(infile, outfile):
    '''Load the chunk annotations and (re)create a joint index.

    The table is loaded with ``-i gene_id -i exon_id``. Afterwards the
    index ``<table>_joint`` on (gene_id, exon_id) is dropped if it
    exists and created again.
    '''
    P.load(infile, outfile, "-i gene_id -i exon_id")

    index_sql = '''DROP INDEX IF EXISTS %(tablename)s_joint;
                               CREATE INDEX %(tablename)s_joint ON
                                   %(tablename)s(gene_id,exon_id)''' % {
        "tablename": P.toTable(outfile)}

    connect().executescript(index_sql)

Esempio n. 34

0

Mostra file

File: pipeline_exome_cancer.py Progetto: gjaime/CGATPipelines

def loadVariantAnnotation(infile, outfile):
    '''Load VCF annotations into database.

    Only files ending in "indels.annotated.filtered.tsv" or
    "mutect.snp.annotated.filtered.tsv" are accepted; both are loaded
    with ``--add-index=CHROM,POS,SNPEFF_GENE_NAME``.

    Raises
    ------
    ValueError
        If `infile` has neither of the two recognised suffixes.
    '''
    known_suffixes = ("indels.annotated.filtered.tsv",
                      "mutect.snp.annotated.filtered.tsv")

    # fail with a clear message instead of an unbound local variable
    if not infile.endswith(known_suffixes):
        raise ValueError(
            "cannot choose index columns for unrecognised input file %s"
            % infile)

    indices = "CHROM,POS,SNPEFF_GENE_NAME"

    P.load(infile, outfile, options="--add-index=%s" % indices)

Esempio n. 35

0

Mostra file

def loadVariantAnnotation(infile, outfile):
    '''Load VCF annotations into database.

    Only files ending in "indels.annotated.filtered.tsv" or
    "mutect.snp.annotated.filtered.tsv" are accepted; both are loaded
    with ``--add-index=CHROM,POS,SNPEFF_GENE_NAME``.

    Raises
    ------
    ValueError
        If `infile` has neither of the two recognised suffixes.
    '''
    known_suffixes = ("indels.annotated.filtered.tsv",
                      "mutect.snp.annotated.filtered.tsv")

    # fail with a clear message instead of an unbound local variable
    if not infile.endswith(known_suffixes):
        raise ValueError(
            "cannot choose index columns for unrecognised input file %s"
            % infile)

    indices = "CHROM,POS,SNPEFF_GENE_NAME"

    P.load(infile, outfile, options="--add-index=%s" % indices)

Esempio n. 36

0

Mostra file

File: pipeline_rnaseqlncrna.py Progetto: Acribbs/CGATPipelines

def loadCPCResults(infile, outfile):
    '''
    Load the CPC analysis results in `infile`, naming the columns
    transcript_id, feature, C_NC and CP_score and passing
    ``--add-index=transcript_id``.
    '''
    load_options = " ".join([
        "--header-names=transcript_id,feature,C_NC,CP_score",
        "--add-index=transcript_id",
    ])

    P.load(infile=infile, outfile=outfile, options=load_options)

Esempio n. 37

0

Mostra file

File: pipeline_variant_annotation.py Progetto: CGATOxford/CGATPipelines

def loadPolyphenMap(infile, outfile):
    '''Load polyphen input data.

    The file loaded is ``<infile>.map``. One ``--add-index`` option is
    passed for each of: snp_id, (track, transcript_id), (contig, pos),
    protein_id and transcript_id.
    '''
    indexed_columns = ("snp_id",
                       "track,transcript_id",
                       "contig,pos",
                       "protein_id",
                       "transcript_id")
    # each option is followed by a single blank, the last one included
    index_options = "".join(
        "--add-index=%s " % columns for columns in indexed_columns)

    P.load(infile + ".map", outfile, options=index_options)

Esempio n. 38

0

Mostra file

File: pipeline_variant_annotation.py Progetto: Acribbs/CGATPipelines

def loadPolyphenMap(infile, outfile):
    '''Load polyphen input data.

    The file loaded is ``<infile>.map``. One ``--add-index`` option is
    passed for each of: snp_id, (track, transcript_id), (contig, pos),
    protein_id and transcript_id.
    '''
    indexed_columns = ("snp_id",
                       "track,transcript_id",
                       "contig,pos",
                       "protein_id",
                       "transcript_id")
    # each option is followed by a single blank, the last one included
    index_options = "".join(
        "--add-index=%s " % columns for columns in indexed_columns)

    P.load(infile + ".map", outfile, options=index_options)

Esempio n. 39

0

Mostra file

File: PipelineBamStats.py Progetto: CGATOxford/CGATPipelines

def loadTranscriptProfile(infiles, outfile,
                          suffix="transcript_profile",
                          tablename=None):
    '''Load transcript profiles into one table.

    For every input file the companion matrix
    ``<infile>.geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz``
    is read, its ``none`` column is renamed to the track name (the
    basename of `infile` without ".transcriptprofile.gz") and the
    ``area``, ``counts`` and ``background`` columns are dropped. The
    tables are merged on ``bin``, ``region`` and ``region_bin``.

    Arguments
    ---------
    infiles : string
        Filenames of files with matrix from bam2geneprofile. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Used as the table name when `tablename` is unset.
    tablename : string
        Tablename to use. If unset, `suffix` is used as the table name.

    Raises
    ------
    ValueError
        If `infiles` is empty, as there is nothing to load.
    '''

    if not tablename:
        tablename = "%s" % suffix

    table_join = None

    for infile in infiles:

        matrix_file = str(infile) + ".geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz"
        name = P.snip(os.path.basename(infile), ".transcriptprofile.gz")

        table = pd.read_csv(matrix_file, sep="\t")
        table.rename(columns={'none': name}, inplace=True)
        table.drop(["area", "counts", "background"], axis=1, inplace=True)

        if table_join is None:
            table_join = table
        else:
            table_join = table.merge(table_join,
                                     on=["bin", "region", "region_bin"],
                                     how="left")

    if table_join is None:
        raise ValueError("no transcript profile files were given to load")

    outf = P.getTempFile(".")
    try:
        table_join.to_csv(outf, sep="\t", index=False)
        outf.close()

        P.load(infile=outf.name,
               outfile=outfile,
               tablename=tablename,
               options="--add-index=bin")
    finally:
        # closing twice is harmless; always remove the temporary file
        outf.close()
        os.unlink(outf.name)

Esempio n. 40

0

Mostra file

def build_db(infiles, outfile):
    '''
    Store data generated throughout the pipeline in a sqlite database.
    The structure of the data tables and of the database is meant to
    be compatible with the shiny app.
    '''

    # Tables recorded in the database: merged_filter_summary,
    # merged_qc_summary, merged_taxonomy, merged_abundance_id and the
    # yml table.
    P.load(infile=infiles, outfile=outfile)

Esempio n. 41

0

Mostra file

File: pipeline_splicing.py Progetto: wbyu/CGATPipelines

def loadPermuteMATS(infile, outfile):
    '''Load the rMATS permutation summary into the relational database.

    Parameters
    ----------
    infile: file containing summary table of rMATS permutation results
    outfile: .load file
    '''
    P.load(infile=infile, outfile=outfile)

Esempio n. 42

0

Mostra file

File: pipeline_splicing.py Progetto: wbyu/CGATPipelines

def loadCollateMATS(infile, outfile):
    '''Load the rMATS summary results into the relational database.

    Parameters
    ----------
    infile: file containing summary table of rMATS results
    outfile: .load file
    '''
    P.load(infile=infile, outfile=outfile)

Esempio n. 43

0

Mostra file

File: pipeline_splicing.py Progetto: CGATOxford/CGATPipelines

def loadPermuteMATS(infile, outfile):
    '''Load the rMATS permutation summary into the relational database.

    Parameters
    ----------
    infile: file containing summary table of rMATS permutation results
    outfile: .load file
    '''
    P.load(infile=infile, outfile=outfile)

Esempio n. 44

0

Mostra file

File: pipeline_splicing.py Progetto: CGATOxford/CGATPipelines

def loadCollateMATS(infile, outfile):
    '''Load the rMATS summary results into the relational database.

    Parameters
    ----------
    infile: file containing summary table of rMATS results
    outfile: .load file
    '''
    P.load(infile=infile, outfile=outfile)

Esempio n. 45

0

Mostra file

def loadCountReads(infiles,
                   outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''Load read counts.

    For every line of every input file the second tab-separated field
    is written as the ``nreads`` value of that file's track. Missing
    input files are skipped with a warning.

    Arguments
    ---------
    infiles : string
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")
    try:
        outf.write("%s\t%s\n" % ("track", "nreads"))

        for filename in infiles:
            track = P.snip(os.path.basename(filename), pipeline_suffix)

            if not os.path.exists(filename):
                E.warn("File %s missing" % filename)
                continue

            with IOTools.openFile(filename, "r") as inf:
                for line in inf:
                    # strip the line end first: otherwise the count
                    # carries its own newline and every row would be
                    # followed by a blank line.
                    count = line.rstrip("\r\n").split("\t")[1]
                    outf.write("%s\t%s\n" % (track, count))

        outf.close()

        P.load(infile=outf.name,
               outfile=outfile,
               tablename=tablename,
               options="--add-index=track")
    finally:
        # closing twice is harmless; always remove the temporary file
        outf.close()
        os.unlink(outf.name)

Esempio n. 46

0

Mostra file

def loadManualAnnotations(infile, outfile):
    '''Load a manual annotation list as a two-column table.

    Every line of `infile` is prefixed with the annotation name
    (`infile` without "_annotations.tsv") and a tab. The header uses
    the annotation name for the first column and ``gene_id`` for the
    second. The load is run with ``--add-index=gene_id``.
    '''
    tmp = P.getTempFilename(".")

    annotation = P.snip(infile, "_annotations.tsv")
    row_prefix = annotation + "\t"

    with IOTools.openFile(tmp, "w") as outf:
        outf.write(row_prefix + "gene_id\n")
        with IOTools.openFile(infile, "r") as inf:
            for gene_line in inf:
                # lines keep their own line ending
                outf.write(row_prefix + gene_line)

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)

Esempio n. 47

0

Mostra file

File: pipeline_exome_cancer.py Progetto: gjaime/CGATPipelines

def loadManualAnnotations(infile, outfile):
    '''Load a manual annotation list as a two-column table.

    Every line of `infile` is prefixed with the annotation name
    (`infile` without "_annotations.tsv") and a tab. The header uses
    the annotation name for the first column and ``gene_id`` for the
    second. The load is run with ``--add-index=gene_id``.
    '''
    tmp = P.getTempFilename(".")

    annotation = P.snip(infile, "_annotations.tsv")
    row_prefix = annotation + "\t"

    with IOTools.openFile(tmp, "w") as outf:
        outf.write(row_prefix + "gene_id\n")
        with IOTools.openFile(infile, "r") as inf:
            for gene_line in inf:
                # lines keep their own line ending
                outf.write(row_prefix + gene_line)

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)

Esempio n. 48

0

Mostra file

File: pipeline_variant_annotation.py Progetto: Acribbs/CGATPipelines

def _mergeEffectTracks(infiles, merged_name, extension):
    '''Concatenate per-track gzipped tables into the file `merged_name`.

    For each file in `infiles` the table ``<file><extension>`` is read.
    The header of the first table found is written once, prefixed with
    a ``track`` column; every data line is prefixed with the track name
    (basename of the input without ".effects.gz"). Missing tables are
    skipped with a warning, empty ones silently.
    '''
    header_written = False
    with open(merged_name, 'w') as outf:
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            source = f + extension
            if not os.path.exists(source):
                E.warn("File %s missing" % source)
                continue
            # text mode: the lines are %-formatted into str below
            with gzip.open(source, "rt") as inf:
                lines = inf.readlines()
            if not lines:
                continue
            if not header_written:
                outf.write("%s\t%s" % ("track", lines[0]))
                header_written = True
            for line in lines[1:]:
                outf.write("%s\t%s" % (track, line))


def mergeEffects(infiles, outfile):
    '''Load transcript effects into single table.

    The effects of all tracks are merged into ``effects.txt`` and
    loaded. The same is then done for each of the "cds", "intron",
    "splicing", "translation" and "genes" companion tables
    (``<infile>.<suffix>.gz``), which are merged into
    ``effects.<suffix>.txt`` and loaded into ``<table>_<suffix>``,
    where ``<table>`` is derived from `outfile`.
    '''

    tablename = P.toTable(outfile)

    merged = 'effects.txt'
    _mergeEffectTracks(infiles, merged, "")

    # load the file that was actually written ("effects.txt")
    P.load(merged,
           outfile,
           options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        merged = 'effects.' + suffix + '.txt'
        _mergeEffectTracks(infiles, merged, "." + suffix + ".gz")

        P.load(merged,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")

Esempio n. 49

0

Mostra file

def load_last_exon_chunks(infile, outfile):
    '''Load a gene_id/chunk_id table built from a GTF file into the database.

    For every entry of `infile` one row is written holding the gene_id
    and the exon_id attribute with all semicolons removed. The table is
    written to a shared temporary file, loaded with ``P.load`` and the
    temporary file is removed afterwards.
    '''

    from CGAT import GTF

    with P.getTempFile(shared=True) as table:
        table.write("gene_id\tchunk_id\n")
        for record in GTF.iterator(IOTools.openFile(infile)):
            gene_id = record.gene_id
            chunk_id = re.sub(";", "", record["exon_id"])
            row = "\t".join([gene_id, chunk_id])
            table.write(row + "\n")
        table_path = table.name

    # NOTE(review): the header names the second column "chunk_id" but the
    # index is requested on "exon_id" - confirm which name is intended.
    P.load(table_path, outfile, options="-i gene_id -i exon_id")
    os.unlink(table_path)

Esempio n. 50

0

Mostra file

File: pipeline_variant_annotation.py Progetto: CGATOxford/CGATPipelines

def _concatenateEffects(sources, outfilename):
    '''Concatenate gzipped tables into one file with a leading track column.

    The header line of the first non-empty input is written once,
    prefixed with ``track``. Every following line of every input is
    written prefixed with the track of the file it came from.

    Arguments
    ---------
    sources : list
        ``(track, filename)`` tuples. Missing or empty files are
        reported with a warning and skipped.
    outfilename : string
        Name of the plain-text file to write.
    '''
    needs_header = True
    with open(outfilename, "w") as outf:
        for track, filename in sources:
            if not os.path.exists(filename):
                E.warn("File %s missing" % filename)
                continue
            # Text mode is required: in binary mode the lines are bytes
            # and "%s" would write their repr (b'...') under Python 3.
            with gzip.open(filename, "rt") as inf:
                header = inf.readline()
                if not header:
                    # No header to take from this file; leave that to
                    # the next non-empty input.
                    E.warn("File %s is empty" % filename)
                    continue
                if needs_header:
                    outf.write("%s\t%s" % ("track", header))
                    needs_header = False
                for line in inf:
                    outf.write("%s\t%s" % (track, line))


def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.

    All ``*.effects.gz`` files are merged into ``effects.txt`` and loaded
    into the table derived from `outfile`. For each of the suffixes
    ``cds``, ``intron``, ``splicing``, ``translation`` and ``genes`` the
    companion files ``<infile>.<suffix>.gz`` are merged into
    ``effects.<suffix>.txt`` and loaded into ``<table>_<suffix>``.

    Arguments
    ---------
    infiles : list
        Filenames ending in ``.effects.gz``; the basename without that
        suffix is used as the track name.
    outfile : string
        Output file passed on to ``P.load``; also determines the table
        name via ``P.toTable``.
    '''

    tablename = P.toTable(outfile)
    sources = [(P.snip(os.path.basename(f), ".effects.gz"), f)
               for f in infiles]

    merged = "effects.txt"
    _concatenateEffects(sources, merged)

    # load the file that was actually written
    P.load(merged,
           outfile,
           options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        merged = "effects." + suffix + ".txt"
        _concatenateEffects(
            [(track, f + "." + suffix + ".gz") for track, f in sources],
            merged)

        P.load(merged,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")

Esempio n. 51

0

Mostra file

File: PipelineBamStats.py Progetto: CGATOxford/CGATPipelines

def loadCountReads(infiles, outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''load read counts.

    The second tab-separated field of every line of every input file is
    written to a temporary two-column table (``track``, ``nreads``),
    which is then loaded with an index on ``track``.

    Arguments
    ---------
    infiles : list
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track. Missing files are reported
        with a warning and skipped.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")

    try:
        outf.write("%s\t%s\n" % ("track", "nreads"))

        for filename in infiles:
            track = P.snip(os.path.basename(filename), pipeline_suffix)

            if not os.path.exists(filename):
                E.warn("File %s missing" % filename)
                continue

            with IOTools.openFile(filename, "r") as inf:
                for line in inf:
                    # strip the line terminator first, otherwise it ends
                    # up inside the count and each row is followed by a
                    # blank line
                    count = line.rstrip("\r\n").split("\t")[1]
                    outf.write("%s\t%s\n" % (track, count))

        # flush everything to disk before the loader reads the file
        outf.close()

        P.load(infile=outf.name,
               outfile=outfile,
               tablename=tablename,
               options="--add-index=track")
    finally:
        # also reached on failure, so the temporary file is never left
        # behind; closing an already closed file is harmless
        outf.close()
        os.unlink(outf.name)

Esempio n. 52

0

Mostra file

File: pipeline_splicing.py Progetto: wbyu/CGATPipelines

def loadMATS(infile, outfile):
    '''load RMATS results into relational database

    Loads rMATS results into relational database.
    If loading fails (for example because the table is empty), the
    failure is logged as a warning and `outfile` is touched instead so
    that the pipeline continues.

    Parameters
    ----------
    infile : string
        :term:`tsv` file containing one type of rMATS results.
    outfile : string
        .load file
    '''
    import logging

    try:
        P.load(infile, outfile)
    except Exception as error:
        # Deliberately best-effort, but only for ordinary errors:
        # KeyboardInterrupt and SystemExit must still propagate.
        logging.getLogger(__name__).warning(
            "loading %s failed (%s); touching %s instead",
            infile, error, outfile)
        P.touch(outfile)
Esempio n. 53

0

Mostra file

File: pipeline_iCLIP.py Progetto: shulp2211/UMIpipe

def loadClusterCounts(infiles, outfile):
    '''Find the number of significant clusters found in each sample.

    For every input file the number of lines is recorded together with
    the method and track parsed from its path, which must look like
    ``dedup_<method>.dir/<track>.clusters.bedgraph``. The resulting
    table (``method``, ``track``, ``count``) is written to a shared
    temporary file and loaded into the database.

    Raises
    ------
    ValueError
        If an input filename does not follow the expected pattern.
    '''

    # raw string: "\." inside a normal string literal is an invalid
    # escape sequence; the compiled pattern itself is unchanged
    pattern = re.compile(r"dedup_(.+).dir/(.+)\.clusters.bedgraph")

    tmp = P.getTempFilename(shared=True)
    try:
        results = []
        for infile in infiles:
            count = IOTools.getNumLines(infile)
            match = pattern.match(infile)
            if match is None:
                raise ValueError(
                    "can not derive method and track from filename %s"
                    % infile)
            method, track = match.groups()
            results.append((method, track, count))

        IOTools.writeLines(tmp, results,
                           header=["method", "track", "count"])

        P.load(tmp, outfile)
    finally:
        # remove the temporary file even when parsing or loading fails
        os.unlink(tmp)

Esempio n. 54

0

Mostra file

File: pipeline_splicing.py Progetto: CGATOxford/CGATPipelines

def loadMATS(infile, outfile):
    '''load RMATS results into relational database

    Loads rMATS results into relational database.
    If loading fails (for example because the table is empty), the
    failure is logged as a warning and `outfile` is touched instead so
    that the pipeline continues.

    Parameters
    ----------
    infile : string
        :term:`tsv` file containing one type of rMATS results.
    outfile : string
        .load file
    '''
    import logging

    try:
        P.load(infile, outfile)
    except Exception as error:
        # Deliberately best-effort, but only for ordinary errors:
        # KeyboardInterrupt and SystemExit must still propagate.
        logging.getLogger(__name__).warning(
            "loading %s failed (%s); touching %s instead",
            infile, error, outfile)
        P.touch(outfile)
Esempio n. 55

0

Mostra file

File: pipeline_motifs.py Progetto: wbyu/CGATPipelines

def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.

    Writes a one-column table (``track``) with one row per non-empty
    input file, the value being the filename after
    ``P.snip(infile, ".meme")``, and loads it with ``P.load``.

    Arguments
    ---------
    infiles : list
        Filenames ending in ``.meme``. Empty files are skipped.
    outfile : string
        Output file passed on to ``P.load``.
    '''

    outf = P.getTempFile(".")

    try:
        outf.write("track\n")

        for infile in infiles:
            if IOTools.isEmpty(infile):
                continue
            motif = P.snip(infile, ".meme")
            outf.write("%s\n" % motif)

        # flush to disk before the loader reads the file
        outf.close()

        P.load(outf.name, outfile)
    finally:
        # also reached on failure, so the temporary file in the working
        # directory is never left behind; a second close is harmless
        outf.close()
        os.unlink(outf.name)

Esempio n. 56

0

Mostra file

File: pipeline_motifs.py Progetto: gjaime/CGATPipelines

def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.

    Writes a one-column table (``motif``) with one row per non-empty
    input file, the value being the filename after
    ``P.snip(infile, ".motif")``, and loads it with ``P.load`` using
    ``--allow-empty-file``.

    Arguments
    ---------
    infiles : list
        Filenames ending in ``.motif``. Empty files are skipped.
    outfile : string
        Output file passed on to ``P.load``.
    '''

    outf = P.getTempFile(".")

    try:
        outf.write("motif\n")

        for infile in infiles:
            if IOTools.isEmpty(infile):
                continue
            motif = P.snip(infile, ".motif")
            outf.write("%s\n" % motif)

        # flush to disk before the loader reads the file
        outf.close()

        P.load(outf.name, outfile, "--allow-empty-file")
    finally:
        # also reached on failure, so the temporary file in the working
        # directory is never left behind; a second close is harmless
        outf.close()
        os.unlink(outf.name)

Esempio n. 57

0

Mostra file

File: pipeline_motifs.py Progetto: gjaime/CGATPipelines

def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.

    Writes a one-column table (``track``) with one row per non-empty
    input file, the value being the filename after
    ``P.snip(infile, ".meme")``, and loads it with ``P.load``.

    Arguments
    ---------
    infiles : list
        Filenames ending in ``.meme``. Empty files are skipped.
    outfile : string
        Output file passed on to ``P.load``.
    '''

    outf = P.getTempFile(".")

    try:
        outf.write("track\n")

        for infile in infiles:
            if IOTools.isEmpty(infile):
                continue
            motif = P.snip(infile, ".meme")
            outf.write("%s\n" % motif)

        # flush to disk before the loader reads the file
        outf.close()

        P.load(outf.name, outfile)
    finally:
        # also reached on failure, so the temporary file in the working
        # directory is never left behind; a second close is harmless
        outf.close()
        os.unlink(outf.name)

Esempio n. 58

0

Mostra file

def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.

    Writes a one-column table (``motif``) with one row per non-empty
    input file, the value being the filename after
    ``P.snip(infile, ".motif")``, and loads it with ``P.load`` using
    ``--allow-empty-file``.

    Arguments
    ---------
    infiles : list
        Filenames ending in ``.motif``. Empty files are skipped.
    outfile : string
        Output file passed on to ``P.load``.
    '''

    outf = P.getTempFile(".")

    try:
        outf.write("motif\n")

        for infile in infiles:
            if IOTools.isEmpty(infile):
                continue
            motif = P.snip(infile, ".motif")
            outf.write("%s\n" % motif)

        # flush to disk before the loader reads the file
        outf.close()

        P.load(outf.name, outfile, "--allow-empty-file")
    finally:
        # also reached on failure, so the temporary file in the working
        # directory is never left behind; a second close is harmless
        outf.close()
        os.unlink(outf.name)